1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2020, The University of Queensland 14 * Copyright (c) 2018, Joyent, Inc. 15 * Copyright 2020 RackTop Systems, Inc. 16 */ 17 18 /* 19 * Mellanox Connect-X 4/5/6 driver. 20 */ 21 22 /* 23 * The PRM for this family of parts is freely available, and can be found at: 24 * https://www.mellanox.com/related-docs/user_manuals/ \ 25 * Ethernet_Adapters_Programming_Manual.pdf 26 */ 27 /* 28 * ConnectX glossary 29 * ----------------- 30 * 31 * WR Work Request: something we've asked the hardware to do by 32 * creating a Work Queue Entry (WQE), e.g. send or recv a packet 33 * 34 * WQE Work Queue Entry: a descriptor on a work queue descriptor ring 35 * 36 * WQ Work Queue: a descriptor ring that we can place WQEs on, usually 37 * either a Send Queue (SQ) or Receive Queue (RQ). Different WQ 38 * types have different WQE structures, different commands for 39 * creating and destroying them, etc, but share a common context 40 * structure, counter setup and state graph. 41 * SQ Send Queue, a specific type of WQ that sends packets 42 * RQ Receive Queue, a specific type of WQ that receives packets 43 * 44 * CQ Completion Queue: completion of WRs from a WQ are reported to 45 * one of these, as a CQE on its entry ring. 46 * CQE Completion Queue Entry: an entry in a CQ ring. Contains error 47 * info, as well as packet size, the ID of the WQ, and the index 48 * of the WQE which completed. Does not contain any packet data. 49 * 50 * EQ Event Queue: a ring of event structs from the hardware informing 51 * us when particular events happen. Many events can point at a 52 * a particular CQ which we should then go look at. 53 * EQE Event Queue Entry: an entry on the EQ ring 54 * 55 * UAR User Access Region, a page of the device's PCI BAR which is 56 * tied to particular EQ/CQ/WQ sets and contains doorbells to 57 * ring to arm them for interrupts or wake them up for new work 58 * 59 * RQT RQ Table, a collection of indexed RQs used to refer to the group 60 * as a single unit (for e.g. hashing/RSS). 61 * 62 * TIR Transport Interface Recieve, a bucket of resources for the 63 * reception of packets. TIRs have to point at either a single RQ 64 * or a table of RQs (RQT). They then serve as a target for flow 65 * table entries (FEs). TIRs that point at an RQT also contain the 66 * settings for hashing for RSS. 67 * 68 * TIS Transport Interface Send, a bucket of resources associated with 69 * the transmission of packets. In particular, the temporary 70 * resources used for LSO internally in the card are accounted to 71 * a TIS. 72 * 73 * FT Flow Table, a collection of FEs and FGs that can be referred to 74 * as a single entity (e.g. used as a target from another flow 75 * entry or set as the "root" table to handle incoming or outgoing 76 * packets). Packets arriving at a FT are matched against the 77 * FEs in the table until either one matches with a terminating 78 * action or all FEs are exhausted (it's first-match-wins but with 79 * some actions that are non-terminal, like counting actions). 80 * 81 * FG Flow Group, a group of FEs which share a common "mask" (i.e. 
82 * they match on the same attributes of packets coming into the 83 * flow). 84 * 85 * FE Flow Entry, an individual set of values to match against 86 * packets entering the flow table, combined with an action to 87 * take upon a successful match. The action we use most is 88 * "forward", which sends the packets to a TIR or another flow 89 * table and then stops further processing within the FE's FT. 90 * 91 * lkey/mkey A reference to something similar to a page table but in the 92 * device's internal onboard MMU. Since Connect-X parts double as 93 * IB cards (lots of RDMA) they have extensive onboard memory mgmt 94 * features which we try very hard not to use. For our WQEs we use 95 * the "reserved" lkey, which is a special value which indicates 96 * that addresses we give are linear addresses and should not be 97 * translated. 98 * 99 * PD Protection Domain, an IB concept. We have to allocate one to 100 * provide as a parameter for new WQs, but we don't do anything 101 * with it. 102 * 103 * TDOM/TD Transport Domain, an IB concept. We allocate one in order to 104 * provide it as a parameter to TIR/TIS creation, but we don't do 105 * anything with it. 106 */ 107 /* 108 * 109 * Data flow overview 110 * ------------------ 111 * 112 * This driver is a MAC ring-enabled driver which maps rings to send and recv 113 * queues in hardware on the device. 114 * 115 * Each SQ and RQ is set up to report to its own individual CQ, to ensure 116 * sufficient space, and simplify the logic needed to work out which buffer 117 * was completed. 118 * 119 * The CQs are then round-robin allocated onto EQs, of which we set up one per 120 * interrupt that the system gives us for the device. Normally this means we 121 * have 8 EQs. 122 * 123 * When we have >= 8 EQs available, we try to allocate only RX or only TX 124 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion. 125 * 126 * EQ #0 is reserved for all event types other than completion events, and has 127 * no CQs associated with it at any time. EQs #1 and upwards are only used for 128 * handling CQ completion events. 129 * 130 * +------+ +------+ +------+ +---------+ 131 * | SQ 0 |---->| CQ 0 |-----+ | EQ 0 |------> | MSI-X 0 | mlxcx_intr_0 132 * +------+ +------+ | +------+ +---------+ 133 * | 134 * +------+ +------+ | 135 * | SQ 1 |---->| CQ 1 |---+ | +------+ 136 * +------+ +------+ | +---> | | 137 * | | | 138 * +------+ +------+ | | EQ 1 | +---------+ 139 * | SQ 2 |---->| CQ 2 |---------> | |------> | MSI-X 1 | mlxcx_intr_n 140 * +------+ +------+ | +---> | | +---------+ 141 * | | +------+ 142 * | | 143 * ... | | 144 * | | +------+ 145 * +------+ +------+ +-----> | | 146 * | RQ 0 |---->| CQ 3 |---------> | | +---------+ 147 * +------+ +------+ | | EQ 2 |------> | MSI-X 2 | mlxcx_intr_n 148 * | | | +---------+ 149 * +------+ +------+ | +-> | | 150 * | RQ 1 |---->| CQ 4 |-----+ | +------+ 151 * +------+ +------+ | 152 * | .... 153 * +------+ +------+ | 154 * | RQ 2 |---->| CQ 5 |-------+ 155 * +------+ +------+ 156 * 157 * ... (note this diagram does not show RX-only or TX-only EQs) 158 * 159 * For TX, we advertise all of the SQs we create as plain rings to MAC with 160 * no TX groups. This puts MAC in "virtual group" mode where it will allocate 161 * and use the rings as it sees fit. 162 * 163 * For RX, we advertise actual groups in order to make use of hardware 164 * classification. 165 * 166 * The hardware classification we use is based around Flow Tables, and we 167 * currently ignore all of the eswitch features of the card. 
The NIC VPORT 168 * is always set to promisc mode so that the eswitch sends us all of the 169 * traffic that arrives on the NIC, and we use flow entries to manage 170 * everything. 171 * 172 * We use 2 layers of flow tables for classification: traffic arrives at the 173 * root RX flow table which contains MAC address filters. Those then send 174 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN 175 * presence and VID filters. 176 * 177 * Since these parts only support doing RSS hashing on a single protocol at a 178 * time, we have to use a third layer of flow tables as well to break traffic 179 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc) 180 * so that it can be sent to the appropriate TIR for hashing. 181 * 182 * Incoming packets 183 * + +---------+ +---------+ 184 * | +->| group 0 | | group 0 | 185 * | | | vlan ft | +-->| hash ft | 186 * v | | L1 | | | L2 | 187 * +----+----+ | +---------+ | +---------+ +-----+ +-----+------+ 188 * | eswitch | | | | | | TCPv6 |--->| TIR |--->| | RQ0 | 189 * +----+----+ | | | | +---------+ +-----+ | +------+ 190 * | | | | | | UDPv6 |--->| TIR |--->| | RQ1 | 191 * | | | | | +---------+ +-----+ | +------+ 192 * | | | | | | TCPv4 |--->| TIR |--->| | RQ2 | 193 * v | | | | +---------+ +-----+ | RQT +------+ 194 * +----+----+ | +---------+ | | UDPv4 |--->| TIR |--->| | ... | 195 * | root rx | | | default |--+ +---------+ +-----+ | | | 196 * | flow tb | | +---------+ | | IPv6 |--->| TIR |--->| | | 197 * | L0 | | | promisc |--+ +---------+ +-----+ | | | 198 * +---------+ | +---------+ ^ | IPv4 |--->| TIR |--->| | | 199 * | bcast |---|---------------+ +---------+ +-----+ +-----+------+ 200 * +---------+ | ^ | other |-+ 201 * | MAC 0 |---+ | +---------+ | +-----+ +-----+ 202 * +---------+ | +->| TIR |--->| RQ0 | 203 * | MAC 1 |-+ | +-----+ +-----+ 204 * +---------+ | +---------------+ 205 * | MAC 2 |-+ | ^ 206 * +---------+ | | | 207 * | MAC 3 |-+ | +---------+ | +---------+ 208 * +---------+ | | | group 1 | | | group 1 | 209 * | ..... | +--->| vlan ft | | +>| hash ft | 210 * | | | | L1 | | | | L2 | 211 * +---------+ | +---------+ | | +---------+ +-----+ +-----+------+ 212 * | promisc |---+ | VLAN 0 |----+ | TCPv6 |--->| TIR |--->| | RQ3 | 213 * +---------+ +---------+ | +---------+ +-----+ | +------+ 214 * | ..... | | | UDPv6 |--->| TIR |--->| | RQ4 | 215 * | | | +---------+ +-----+ | +------+ 216 * | | | | TCPv4 |--->| TIR |--->| | RQ5 | 217 * | | | +---------+ +-----+ | RQT +------+ 218 * +---------+ | | UDPv4 |--->| TIR |--->| | ... | 219 * | | | +---------+ +-----+ | | | 220 * +---------+ | | IPv6 |--->| TIR |--->| | | 221 * | promisc |--+ +---------+ +-----+ | | | 222 * +---------+ | IPv4 |--->| TIR |--->| | | 223 * +---------+ +-----+ +-----+------+ 224 * | other |-+ 225 * +---------+ | 226 * ....... | +-----+ +-----+ 227 * +->| TIR |--->| RQ3 | 228 * +-----+ +-----+ 229 * 230 * Note that the "promisc" flow entries are only set/enabled when promisc 231 * mode is enabled for the NIC. All promisc flow entries point directly at 232 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0, 233 * the "default group" in MAC). 234 * 235 * The "default" entry in the L1 VLAN filter flow tables is used when there 236 * are no VLANs set for the group, to accept any traffic regardless of tag. It 237 * is deleted as soon as a VLAN filter is added (and re-instated if the 238 * last VLAN filter is removed). 
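 *
 * As a worked example of the above (using the diagram's numbering): a
 * TCPv4 packet addressed to MAC 1 and tagged with VID 5 hits the "MAC 1"
 * entry in the L0 root flow table, which forwards it to group 1's L1
 * VLAN filter table. If a filter for VID 5 is present there (or the
 * "default" entry still is, because the group has no VLANs configured),
 * the packet is forwarded again to group 1's L2 hash table, where the
 * TCPv4 entry sends it to the corresponding TIR. That TIR computes the
 * RSS hash and picks one of the RQs in group 1's RQT (RQ3, RQ4, RQ5,
 * ...), which is where the packet finally lands.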
239 * 240 * The actual descriptor ring structures for RX on Connect-X4 don't contain any 241 * space for packet data (they're a collection of scatter pointers only). TX 242 * descriptors contain some space for "inline headers" (and the card requires 243 * us to put at least the L2 Ethernet headers there for the eswitch to look at) 244 * but all the rest of the data comes from the gather pointers. 245 * 246 * When we get completions back they simply contain the ring index number of 247 * the WR (work request) which completed. So, we manage the buffers for actual 248 * packet data completely independently of the descriptors in this driver. When 249 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer 250 * with the WQE index that we put it at, and therefore don't have to look at 251 * the original descriptor at all when handling completions. 252 * 253 * For RX, we create sufficient packet data buffers to fill 150% of the 254 * available descriptors for each ring. These all are pre-set-up for DMA and 255 * have an mblk_t associated with them (with desballoc()). 256 * 257 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is 258 * large enough), or we copy it into a pre-allocated buffer set up in the same 259 * as as for RX. 260 */ 261 262 /* 263 * Buffer lifecycle: RX 264 * -------------------- 265 * 266 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty 267 * straightforward. 268 * 269 * It is created (and has all its memory allocated) at the time of starting up 270 * the RX ring it belongs to. Then it is placed on the "free" list in the 271 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants 272 * more buffers to add to the RQ, it takes one off and marks it as "on WQ" 273 * before making a WQE for it. 274 * 275 * After a completion event occurs, the packet is either discarded (and the 276 * buffer_t returned to the free list), or it is readied for loaning to MAC 277 * and placed on the "loaned" list in the mlxcx_buffer_shard_t. 278 * 279 * Once MAC and the rest of the system have finished with the packet, they call 280 * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point 281 * the fate of the buffer_t is determined by the state of the 282 * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t 283 * will be returned to the free list, potentially to be recycled and used 284 * again. But if the shard is draining (E.g. after a ring stop) there will be 285 * no recycling and the buffer_t is immediately destroyed. 286 * 287 * At detach/teardown time, buffers are only every destroyed from the free list. 288 * 289 * 290 * + 291 * | 292 * | mlxcx_buf_create 293 * | 294 * v 295 * +----+----+ 296 * | created | 297 * +----+----+ +------+ 298 * | | dead | 299 * | +------+ 300 * | mlxcx_buf_return ^ 301 * | | 302 * v | mlxcx_buf_destroy 303 * mlxcx_buf_destroy +----+----+ +-----------+ | 304 * +---------| free |<------no-| draining? 
|-yes-+ 305 * | +----+----+ +-----------+ 306 * | | ^ 307 * | | | 308 * v | mlxcx_buf_take | mlxcx_buf_return 309 * +---+--+ v | 310 * | dead | +---+---+ | 311 * +------+ | on WQ |- - - - - - - - >O 312 * +---+---+ ^ 313 * | | 314 * | | 315 * | mlxcx_buf_loan | mlxcx_buf_mp_return 316 * v | 317 * +-------+--------+ | 318 * | on loan to MAC |----------->O 319 * +----------------+ freemsg() 320 * 321 */ 322 323 /* 324 * Buffer lifecycle: TX 325 * -------------------- 326 * 327 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and 328 * "foreign" buffers. 329 * 330 * The former have their memory allocated and DMA bound by this driver, while 331 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is 332 * not owned by us, though we do DMA bind it (and take responsibility for 333 * un-binding it when we're done with them). 334 * 335 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each 336 * SQ. Thus, there is a separate free list and mutex for each kind. 337 * 338 * Since a TX packet might consist of multiple mblks, we translate each mblk 339 * into exactly one buffer_t. The buffer_ts are chained together in the same 340 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t. 341 * 342 * Each chain of TX buffers may consist of foreign or driver buffers, in any 343 * mixture. 344 * 345 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes 346 * it from the rest of the chain buffers. 347 * 348 * TX buffer chains are always returned to the free list by 349 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and 350 * freeing all of the members. 351 * 352 * We only call freemsg() once, on the head of the TX buffer chain's original 353 * mblk. This is true whether we copied it or bound it in a foreign buffer. 354 */ 355 356 /* 357 * Startup and command interface 358 * ----------------------------- 359 * 360 * The command interface is the primary way in which we give control orders to 361 * the hardware (e.g. actions like "create this queue" or "delete this flow 362 * entry"). The command interface is never used to transmit or receive packets 363 * -- that takes place only on the queues that are set up through it. 364 * 365 * In mlxcx_cmd.c we implement our use of the command interface on top of a 366 * simple taskq. As commands are submitted from the taskq they choose a 367 * "slot", if there are no free slots then execution of the command will 368 * be paused until one is free. The hardware permits up to 32 independent 369 * slots for concurrent command execution. 370 * 371 * Before interrupts are enabled, command completion is polled, once 372 * interrupts are up command completions become asynchronous and are 373 * wired to EQ 0. A caveat to this is commands can not be submitted 374 * directly from EQ 0's completion handler, and any processing resulting from 375 * an asynchronous event which requires further use of the command interface 376 * is posted through a taskq. 377 * 378 * The startup/attach process for this card involves a bunch of different steps 379 * which are summarised pretty well in the PRM. We have to send a number of 380 * commands which do different things to start the card up, give it some pages 381 * of our own memory for it to use, then start creating all the entities that 382 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs 383 * and TDoms. 
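 *
 * Returning to the command slots described above, the discipline can be
 * pictured with a minimal sketch (the names below are illustrative only,
 * not the actual mlxcx_cmd.c interface):
 *
 *	mutex_enter(&cmd_lock);
 *	while (free_slots == 0)
 *		cv_wait(&slot_freed_cv, &cmd_lock);
 *	slot = claim_slot();
 *	mutex_exit(&cmd_lock);
 *	post_to_slot(slot, cmd);
 *	wait_for_completion(slot);	(polled, or signalled from EQ 0)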
384 */ 385 386 /* 387 * UARs 388 * ---- 389 * 390 * The pages of the PCI BAR other than the first few are reserved for use as 391 * "UAR" sections in this device. Each UAR section can be used as a set of 392 * doorbells for our queues. 393 * 394 * Currently we just make one single UAR for all of our queues. It doesn't 395 * seem to be a major limitation yet. 396 * 397 * When we're sending packets through an SQ, the PRM is not awful clear about 398 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers 399 * (it's clear on the pattern of alternation you're expected to use between 400 * even and odd for Blueflame sends, but not for regular doorbells). 401 * 402 * Currently we don't do the even-odd alternating pattern for ordinary 403 * doorbells, and we don't use Blueflame at all. This seems to work fine, at 404 * least on Connect-X4 Lx. 405 */ 406 407 /* 408 * Lock ordering 409 * ------------- 410 * 411 * Interrupt side: 412 * 413 * - mleq_mtx 414 * - mlcq_arm_mtx 415 * - mlcq_mtx 416 * - mlcq_bufbmtx 417 * - mlwq_mtx 418 * - mlbs_mtx 419 * - mlp_mtx 420 * 421 * GLD side: 422 * 423 * - mlp_mtx 424 * - mlg_mtx 425 * - mlg_*.mlft_mtx 426 * - mlp_*.mlft_mtx 427 * - mlwq_mtx 428 * - mlbs_mtx 429 * - mlcq_bufbmtx 430 * - mleq_mtx 431 * - mlcq_arm_mtx 432 * - mlcq_mtx 433 * 434 */ 435 436 #include <sys/modctl.h> 437 #include <sys/conf.h> 438 #include <sys/devops.h> 439 #include <sys/sysmacros.h> 440 #include <sys/time.h> 441 442 #include <sys/mac_provider.h> 443 444 #include <mlxcx.h> 445 446 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP); 447 448 #define MLXCX_MODULE_NAME "mlxcx" 449 /* 450 * We give this to the firmware, so it has to be in a fixed format that it 451 * understands. 452 */ 453 #define MLXCX_DRIVER_VERSION "illumos,mlxcx,1.0.0,1,000,000000" 454 455 /* 456 * Firmware may take a while to reclaim pages. Try a set number of times. 457 */ 458 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */ 459 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */ 460 461 static void *mlxcx_softstate; 462 463 /* 464 * Fault detection thresholds. 465 */ 466 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT; 467 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT; 468 469 static void 470 mlxcx_load_prop_defaults(mlxcx_t *mlxp) 471 { 472 mlxcx_drv_props_t *p = &mlxp->mlx_props; 473 mlxcx_port_t *port = &mlxp->mlx_ports[0]; 474 475 VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0); 476 VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0); 477 478 /* 479 * Currently we have different queue size defaults for two 480 * categories of queues. One set for devices which support a 481 * maximum speed of 10Gb/s, and another for those above that. 482 */ 483 if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | 484 MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) { 485 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; 486 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; 487 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; 488 } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | 489 MLXCX_PROTO_10G)) != 0) { 490 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 491 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 492 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 493 } else { 494 mlxcx_warn(mlxp, "Encountered a port with a speed we don't " 495 "recognize. 
Proto: 0x%x", port->mlp_max_proto); 496 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 497 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 498 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 499 } 500 } 501 502 /* 503 * Properties which may have different defaults based on hardware 504 * characteristics. 505 */ 506 static void 507 mlxcx_load_model_props(mlxcx_t *mlxp) 508 { 509 mlxcx_drv_props_t *p = &mlxp->mlx_props; 510 511 mlxcx_load_prop_defaults(mlxp); 512 513 p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 514 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", 515 p->mldp_cq_size_shift_default); 516 p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 517 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", 518 p->mldp_sq_size_shift_default); 519 p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 520 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", 521 p->mldp_rq_size_shift_default); 522 } 523 524 static void 525 mlxcx_load_props(mlxcx_t *mlxp) 526 { 527 mlxcx_drv_props_t *p = &mlxp->mlx_props; 528 529 p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 530 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", 531 MLXCX_EQ_SIZE_SHIFT_DFLT); 532 p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 533 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", 534 MLXCX_CQEMOD_PERIOD_USEC_DFLT); 535 p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 536 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count", 537 MLXCX_CQEMOD_COUNT_DFLT); 538 p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 539 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec", 540 MLXCX_INTRMOD_PERIOD_USEC_DFLT); 541 542 p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 543 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups", 544 MLXCX_TX_NGROUPS_DFLT); 545 p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 546 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group", 547 MLXCX_TX_NRINGS_PER_GROUP_DFLT); 548 549 p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 550 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large", 551 MLXCX_RX_NGROUPS_LARGE_DFLT); 552 p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 553 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small", 554 MLXCX_RX_NGROUPS_SMALL_DFLT); 555 p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY, 556 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 557 "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT); 558 p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY, 559 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 560 "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT); 561 562 p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 563 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift", 564 MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT); 565 566 p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 567 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold", 568 MLXCX_TX_BIND_THRESHOLD_DFLT); 569 570 p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 571 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift", 572 MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT); 573 574 p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 575 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 576 
"eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT); 577 p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 578 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 579 "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT); 580 p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 581 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 582 "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); 583 584 p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 585 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", 586 MLXCX_RX_PER_CQ_DEFAULT); 587 588 if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || 589 p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { 590 mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " 591 "out of range. Defaulting to: %d. Valid values are from " 592 "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, 593 MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); 594 p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; 595 } 596 } 597 598 void 599 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...) 600 { 601 va_list ap; 602 603 va_start(ap, fmt); 604 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 605 vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap); 606 } else { 607 vcmn_err(CE_NOTE, fmt, ap); 608 } 609 va_end(ap); 610 } 611 612 void 613 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...) 614 { 615 va_list ap; 616 617 va_start(ap, fmt); 618 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 619 vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap); 620 } else { 621 vcmn_err(CE_WARN, fmt, ap); 622 } 623 va_end(ap); 624 } 625 626 void 627 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) 628 { 629 va_list ap; 630 631 va_start(ap, fmt); 632 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 633 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap); 634 } else { 635 vcmn_err(CE_PANIC, fmt, ap); 636 } 637 va_end(ap); 638 } 639 640 uint16_t 641 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) 642 { 643 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 644 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); 645 } 646 647 uint32_t 648 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) 649 { 650 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 651 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); 652 } 653 654 uint64_t 655 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) 656 { 657 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 658 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); 659 } 660 661 void 662 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) 663 { 664 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 665 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 666 } 667 668 void 669 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) 670 { 671 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 672 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 673 } 674 675 void 676 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) 677 { 678 /* 679 * The UAR is always inside the first BAR, which we mapped as 680 * mlx_regs 681 */ 682 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 683 (uintptr_t)mlxp->mlx_regs_base; 684 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 685 } 686 687 void 688 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) 689 { 690 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 691 (uintptr_t)mlxp->mlx_regs_base; 692 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 693 } 694 695 static void 696 mlxcx_fm_fini(mlxcx_t *mlxp) 697 { 698 if (mlxp->mlx_fm_caps == 0) 699 return; 700 701 if 
(DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 702 ddi_fm_handler_unregister(mlxp->mlx_dip); 703 704 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 705 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 706 pci_ereport_teardown(mlxp->mlx_dip); 707 708 ddi_fm_fini(mlxp->mlx_dip); 709 710 mlxp->mlx_fm_caps = 0; 711 } 712 713 void 714 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail) 715 { 716 uint64_t ena; 717 char buf[FM_MAX_CLASS]; 718 719 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 720 return; 721 722 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); 723 ena = fm_ena_generate(0, FM_ENA_FMT1); 724 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 725 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 726 NULL); 727 } 728 729 static int 730 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg) 731 { 732 /* 733 * as the driver can always deal with an error in any dma or 734 * access handle, we can just return the fme_status value. 735 */ 736 pci_ereport_post(dip, err, NULL); 737 return (err->fme_status); 738 } 739 740 static void 741 mlxcx_fm_init(mlxcx_t *mlxp) 742 { 743 ddi_iblock_cookie_t iblk; 744 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 745 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE; 746 747 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip, 748 DDI_PROP_DONTPASS, "fm_capable", def); 749 750 if (mlxp->mlx_fm_caps < 0) { 751 mlxp->mlx_fm_caps = 0; 752 } 753 mlxp->mlx_fm_caps &= def; 754 755 if (mlxp->mlx_fm_caps == 0) 756 return; 757 758 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk); 759 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 760 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 761 pci_ereport_setup(mlxp->mlx_dip); 762 } 763 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 764 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb, 765 (void *)mlxp); 766 } 767 } 768 769 static void 770 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) 771 { 772 mlxcx_buffer_t *buf; 773 774 mutex_enter(&s->mlbs_mtx); 775 776 while (!list_is_empty(&s->mlbs_busy)) 777 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 778 779 while (!list_is_empty(&s->mlbs_loaned)) 780 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 781 782 while ((buf = list_head(&s->mlbs_free)) != NULL) 783 mlxcx_buf_destroy(mlxp, buf); 784 785 list_destroy(&s->mlbs_free); 786 list_destroy(&s->mlbs_busy); 787 list_destroy(&s->mlbs_loaned); 788 mutex_exit(&s->mlbs_mtx); 789 790 cv_destroy(&s->mlbs_free_nonempty); 791 mutex_destroy(&s->mlbs_mtx); 792 } 793 794 static void 795 mlxcx_teardown_bufs(mlxcx_t *mlxp) 796 { 797 mlxcx_buf_shard_t *s; 798 799 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) { 800 mlxcx_mlbs_teardown(mlxp, s); 801 kmem_free(s, sizeof (mlxcx_buf_shard_t)); 802 } 803 list_destroy(&mlxp->mlx_buf_shards); 804 805 kmem_cache_destroy(mlxp->mlx_bufs_cache); 806 } 807 808 static void 809 mlxcx_teardown_pages(mlxcx_t *mlxp) 810 { 811 uint_t nzeros = 0; 812 uint64_t *pas; 813 814 pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES, 815 KM_SLEEP); 816 817 mutex_enter(&mlxp->mlx_pagemtx); 818 819 while (mlxp->mlx_npages > 0) { 820 int32_t req, ret; 821 822 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 823 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 824 825 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 826 mlxcx_warn(mlxp, "hardware refused to return pages, " 827 "leaking %u remaining pages", mlxp->mlx_npages); 828 goto out; 829 } 830 831 for (int32_t i = 0; i < ret; i++) { 832 mlxcx_dev_page_t *mdp, probe; 833 bzero(&probe, sizeof (probe)); 
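			/*
			 * Match each PA the firmware says it returned
			 * against our AVL tree of pages, so the backing
			 * DMA memory can be freed. A PA we have no record
			 * of handing out is treated as fatal below.
			 */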
834 probe.mxdp_pa = pas[i]; 835 836 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 837 838 if (mdp != NULL) { 839 avl_remove(&mlxp->mlx_pages, mdp); 840 mlxp->mlx_npages--; 841 mlxcx_dma_free(&mdp->mxdp_dma); 842 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 843 } else { 844 mlxcx_panic(mlxp, "hardware returned a page " 845 "with PA 0x%" PRIx64 " but we have no " 846 "record of giving out such a page", pas[i]); 847 } 848 } 849 850 /* 851 * If no pages were returned, note that fact. 852 */ 853 if (ret == 0) { 854 nzeros++; 855 if (nzeros > mlxcx_reclaim_tries) { 856 mlxcx_warn(mlxp, "hardware refused to return " 857 "pages, leaking %u remaining pages", 858 mlxp->mlx_npages); 859 goto out; 860 } 861 delay(drv_usectohz(mlxcx_reclaim_delay)); 862 } 863 } 864 865 avl_destroy(&mlxp->mlx_pages); 866 867 out: 868 mutex_exit(&mlxp->mlx_pagemtx); 869 mutex_destroy(&mlxp->mlx_pagemtx); 870 871 kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES); 872 } 873 874 static boolean_t 875 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 876 { 877 ddi_device_acc_attr_t acc; 878 ddi_dma_attr_t attr; 879 boolean_t ret; 880 size_t sz, i; 881 882 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 883 884 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift; 885 mleq->mleq_nents = (1 << mleq->mleq_entshift); 886 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t); 887 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 888 889 mlxcx_dma_acc_attr(mlxp, &acc); 890 mlxcx_dma_queue_attr(mlxp, &attr); 891 892 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc, 893 B_TRUE, sz, B_TRUE); 894 if (!ret) { 895 mlxcx_warn(mlxp, "failed to allocate EQ memory"); 896 return (B_FALSE); 897 } 898 899 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va; 900 901 for (i = 0; i < mleq->mleq_nents; ++i) 902 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT; 903 904 mleq->mleq_state |= MLXCX_EQ_ALLOC; 905 906 return (B_TRUE); 907 } 908 909 static void 910 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 911 { 912 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); 913 if (mleq->mleq_state & MLXCX_EQ_CREATED) 914 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 915 916 mlxcx_dma_free(&mleq->mleq_dma); 917 mleq->mleq_ent = NULL; 918 919 mleq->mleq_state &= ~MLXCX_EQ_ALLOC; 920 } 921 922 void 923 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft) 924 { 925 mlxcx_flow_group_t *fg; 926 mlxcx_flow_entry_t *fe; 927 int i; 928 929 ASSERT(mutex_owned(&ft->mlft_mtx)); 930 931 for (i = ft->mlft_nents - 1; i >= 0; --i) { 932 fe = &ft->mlft_ent[i]; 933 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 934 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 935 mlxcx_panic(mlxp, "failed to delete flow " 936 "entry %u on table %u", i, 937 ft->mlft_num); 938 } 939 } 940 } 941 942 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) { 943 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED && 944 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) { 945 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) { 946 mlxcx_panic(mlxp, "failed to destroy flow " 947 "group %u", fg->mlfg_num); 948 } 949 } 950 kmem_free(fg, sizeof (mlxcx_flow_group_t)); 951 } 952 list_destroy(&ft->mlft_groups); 953 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED && 954 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) { 955 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) { 956 mlxcx_panic(mlxp, "failed to destroy flow table %u", 957 ft->mlft_num); 958 } 959 } 960 kmem_free(ft->mlft_ent, ft->mlft_entsize); 961 ft->mlft_ent = NULL; 962 
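	/*
	 * Our caller entered mlft_mtx; we drop and destroy it here, since
	 * the flow table itself is about to be freed.
	 */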
mutex_exit(&ft->mlft_mtx); 963 mutex_destroy(&ft->mlft_mtx); 964 kmem_free(ft, sizeof (mlxcx_flow_table_t)); 965 } 966 967 static void 968 mlxcx_teardown_ports(mlxcx_t *mlxp) 969 { 970 uint_t i; 971 mlxcx_port_t *p; 972 mlxcx_flow_table_t *ft; 973 974 for (i = 0; i < mlxp->mlx_nports; ++i) { 975 p = &mlxp->mlx_ports[i]; 976 if (!(p->mlp_init & MLXCX_PORT_INIT)) 977 continue; 978 mutex_enter(&p->mlp_mtx); 979 if ((ft = p->mlp_rx_flow) != NULL) { 980 mutex_enter(&ft->mlft_mtx); 981 /* 982 * teardown_flow_table() will destroy the mutex, so 983 * we don't release it here. 984 */ 985 mlxcx_teardown_flow_table(mlxp, ft); 986 } 987 mutex_exit(&p->mlp_mtx); 988 mutex_destroy(&p->mlp_mtx); 989 mutex_destroy(&p->mlx_port_event.mla_mtx); 990 p->mlx_port_event.mla_mlx = NULL; 991 p->mlx_port_event.mla_port = NULL; 992 p->mlp_init &= ~MLXCX_PORT_INIT; 993 } 994 995 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size); 996 mlxp->mlx_ports = NULL; 997 } 998 999 static void 1000 mlxcx_teardown_wqs(mlxcx_t *mlxp) 1001 { 1002 mlxcx_work_queue_t *mlwq; 1003 1004 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) { 1005 mlxcx_wq_teardown(mlxp, mlwq); 1006 } 1007 list_destroy(&mlxp->mlx_wqs); 1008 } 1009 1010 static void 1011 mlxcx_teardown_cqs(mlxcx_t *mlxp) 1012 { 1013 mlxcx_completion_queue_t *mlcq; 1014 1015 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) { 1016 mlxcx_cq_teardown(mlxp, mlcq); 1017 } 1018 list_destroy(&mlxp->mlx_cqs); 1019 } 1020 1021 static void 1022 mlxcx_teardown_eqs(mlxcx_t *mlxp) 1023 { 1024 mlxcx_event_queue_t *mleq; 1025 uint_t i; 1026 1027 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1028 mleq = &mlxp->mlx_eqs[i]; 1029 mutex_enter(&mleq->mleq_mtx); 1030 if ((mleq->mleq_state & MLXCX_EQ_CREATED) && 1031 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 1032 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) { 1033 mlxcx_warn(mlxp, "failed to destroy " 1034 "event queue idx %u eqn %u", 1035 i, mleq->mleq_num); 1036 } 1037 } 1038 if (mleq->mleq_state & MLXCX_EQ_ALLOC) { 1039 mlxcx_eq_rele_dma(mlxp, mleq); 1040 } 1041 mutex_exit(&mleq->mleq_mtx); 1042 } 1043 } 1044 1045 static void 1046 mlxcx_teardown_checktimers(mlxcx_t *mlxp) 1047 { 1048 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) 1049 ddi_periodic_delete(mlxp->mlx_eq_checktimer); 1050 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) 1051 ddi_periodic_delete(mlxp->mlx_cq_checktimer); 1052 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) 1053 ddi_periodic_delete(mlxp->mlx_wq_checktimer); 1054 } 1055 1056 static void 1057 mlxcx_teardown(mlxcx_t *mlxp) 1058 { 1059 uint_t i; 1060 dev_info_t *dip = mlxp->mlx_dip; 1061 1062 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1063 /* 1064 * Disable interrupts and let any active vectors quiesce. 
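		 * The EQs themselves are not torn down until the
		 * MLXCX_ATTACH_INTRS step further down.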
1065 */ 1066 mlxcx_intr_disable(mlxp); 1067 } 1068 1069 if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) { 1070 mlxcx_teardown_sensors(mlxp); 1071 mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS; 1072 } 1073 1074 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) { 1075 mlxcx_teardown_checktimers(mlxp); 1076 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS; 1077 } 1078 1079 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) { 1080 mlxcx_teardown_groups(mlxp); 1081 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS; 1082 } 1083 1084 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) { 1085 mlxcx_teardown_wqs(mlxp); 1086 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS; 1087 } 1088 1089 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) { 1090 mlxcx_teardown_cqs(mlxp); 1091 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS; 1092 } 1093 1094 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) { 1095 mlxcx_teardown_bufs(mlxp); 1096 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS; 1097 } 1098 1099 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) { 1100 mlxcx_teardown_ports(mlxp); 1101 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS; 1102 } 1103 1104 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1105 mlxcx_teardown_eqs(mlxp); 1106 mlxcx_intr_teardown(mlxp); 1107 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS; 1108 } 1109 1110 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) { 1111 if (mlxp->mlx_uar.mlu_allocated) { 1112 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) { 1113 mlxcx_warn(mlxp, "failed to release UAR"); 1114 } 1115 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) 1116 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx); 1117 } 1118 if (mlxp->mlx_pd.mlpd_allocated && 1119 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) { 1120 mlxcx_warn(mlxp, "failed to release PD"); 1121 } 1122 if (mlxp->mlx_tdom.mltd_allocated && 1123 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) { 1124 mlxcx_warn(mlxp, "failed to release TDOM"); 1125 } 1126 mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD; 1127 } 1128 1129 if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) { 1130 if (!mlxcx_cmd_teardown_hca(mlxp)) { 1131 mlxcx_warn(mlxp, "failed to send teardown HCA " 1132 "command during device detach"); 1133 } 1134 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA; 1135 } 1136 1137 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) { 1138 mlxcx_teardown_pages(mlxp); 1139 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST; 1140 } 1141 1142 if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) { 1143 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 1144 mlxp->mlx_npages_req[i].mla_mlx = NULL; 1145 mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx); 1146 } 1147 taskq_destroy(mlxp->mlx_async_tq); 1148 mlxp->mlx_async_tq = NULL; 1149 mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ; 1150 } 1151 1152 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) { 1153 if (!mlxcx_cmd_disable_hca(mlxp)) { 1154 mlxcx_warn(mlxp, "failed to send DISABLE HCA command " 1155 "during device detach"); 1156 } 1157 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA; 1158 } 1159 1160 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) { 1161 mlxcx_cmd_queue_fini(mlxp); 1162 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD; 1163 } 1164 1165 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) { 1166 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 1167 mlxp->mlx_caps = NULL; 1168 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS; 1169 } 1170 1171 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) { 1172 ddi_regs_map_free(&mlxp->mlx_regs_handle); 1173 mlxp->mlx_regs_handle = NULL; 1174 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS; 1175 } 1176 1177 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) { 1178 pci_config_teardown(&mlxp->mlx_cfg_handle); 1179 mlxp->mlx_cfg_handle = NULL; 1180 
mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG; 1181 } 1182 1183 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) { 1184 mlxcx_fm_fini(mlxp); 1185 mlxp->mlx_attach &= ~MLXCX_ATTACH_FM; 1186 } 1187 1188 VERIFY3S(mlxp->mlx_attach, ==, 0); 1189 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst); 1190 ddi_set_driver_private(dip, NULL); 1191 } 1192 1193 static boolean_t 1194 mlxcx_regs_map(mlxcx_t *mlxp) 1195 { 1196 off_t memsize; 1197 int ret; 1198 ddi_device_acc_attr_t da; 1199 1200 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) != 1201 DDI_SUCCESS) { 1202 mlxcx_warn(mlxp, "failed to get register set size"); 1203 return (B_FALSE); 1204 } 1205 1206 /* 1207 * All data in the main BAR is kept in big-endian even though it's a PCI 1208 * device. 1209 */ 1210 bzero(&da, sizeof (ddi_device_acc_attr_t)); 1211 da.devacc_attr_version = DDI_DEVICE_ATTR_V0; 1212 da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC; 1213 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 1214 if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) { 1215 da.devacc_attr_access = DDI_FLAGERR_ACC; 1216 } else { 1217 da.devacc_attr_access = DDI_DEFAULT_ACC; 1218 } 1219 1220 ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER, 1221 &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle); 1222 1223 if (ret != DDI_SUCCESS) { 1224 mlxcx_warn(mlxp, "failed to map device registers: %d", ret); 1225 return (B_FALSE); 1226 } 1227 1228 return (B_TRUE); 1229 } 1230 1231 static boolean_t 1232 mlxcx_check_issi(mlxcx_t *mlxp) 1233 { 1234 uint32_t issi; 1235 1236 if (!mlxcx_cmd_query_issi(mlxp, &issi)) { 1237 mlxcx_warn(mlxp, "failed to get ISSI"); 1238 return (B_FALSE); 1239 } 1240 1241 if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) { 1242 mlxcx_warn(mlxp, "hardware does not support software ISSI, " 1243 "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI); 1244 return (B_FALSE); 1245 } 1246 1247 if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) { 1248 mlxcx_warn(mlxp, "failed to set ISSI to %u", 1249 MLXCX_CURRENT_ISSI); 1250 return (B_FALSE); 1251 } 1252 1253 return (B_TRUE); 1254 } 1255 1256 boolean_t 1257 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven) 1258 { 1259 ddi_device_acc_attr_t acc; 1260 ddi_dma_attr_t attr; 1261 int32_t i; 1262 list_t plist; 1263 mlxcx_dev_page_t *mdp; 1264 mlxcx_dev_page_t **pages; 1265 const ddi_dma_cookie_t *ck; 1266 1267 /* 1268 * If there are no pages required, then we're done here. 1269 */ 1270 if (npages <= 0) { 1271 *ngiven = 0; 1272 return (B_TRUE); 1273 } 1274 1275 npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 1276 1277 pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP); 1278 1279 list_create(&plist, sizeof (mlxcx_dev_page_t), 1280 offsetof(mlxcx_dev_page_t, mxdp_list)); 1281 1282 for (i = 0; i < npages; i++) { 1283 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP); 1284 mlxcx_dma_acc_attr(mlxp, &acc); 1285 mlxcx_dma_page_attr(mlxp, &attr); 1286 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc, 1287 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) { 1288 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i, 1289 npages); 1290 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1291 goto cleanup_npages; 1292 } 1293 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma); 1294 mdp->mxdp_pa = ck->dmac_laddress; 1295 1296 list_insert_tail(&plist, mdp); 1297 } 1298 1299 /* 1300 * Now that all of the pages have been allocated, given them to hardware 1301 * in chunks. 
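	 * The pages are first moved off the local list into a flat array,
	 * which is the form mlxcx_cmd_give_pages() takes.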
1302 */ 1303 for (i = 0; i < npages; i++) { 1304 pages[i] = list_remove_head(&plist); 1305 } 1306 1307 if (!mlxcx_cmd_give_pages(mlxp, 1308 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) { 1309 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 1310 "pages!", npages); 1311 for (i = 0; i < npages; i++) { 1312 list_insert_tail(&plist, pages[i]); 1313 } 1314 goto cleanup_npages; 1315 } 1316 1317 mutex_enter(&mlxp->mlx_pagemtx); 1318 for (i = 0; i < npages; i++) { 1319 avl_add(&mlxp->mlx_pages, pages[i]); 1320 } 1321 mlxp->mlx_npages += npages; 1322 mutex_exit(&mlxp->mlx_pagemtx); 1323 1324 list_destroy(&plist); 1325 kmem_free(pages, sizeof (*pages) * npages); 1326 1327 *ngiven = npages; 1328 1329 return (B_TRUE); 1330 1331 cleanup_npages: 1332 kmem_free(pages, sizeof (*pages) * npages); 1333 while ((mdp = list_remove_head(&plist)) != NULL) { 1334 mlxcx_dma_free(&mdp->mxdp_dma); 1335 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1336 } 1337 list_destroy(&plist); 1338 return (B_FALSE); 1339 } 1340 1341 static boolean_t 1342 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type) 1343 { 1344 int32_t npages, given; 1345 1346 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) { 1347 mlxcx_warn(mlxp, "failed to determine boot pages"); 1348 return (B_FALSE); 1349 } 1350 1351 while (npages > 0) { 1352 if (!mlxcx_give_pages(mlxp, npages, &given)) 1353 return (B_FALSE); 1354 1355 npages -= given; 1356 } 1357 1358 return (B_TRUE); 1359 } 1360 1361 static int 1362 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags) 1363 { 1364 mlxcx_t *mlxp = cookie; 1365 mlxcx_buffer_t *b = arg; 1366 1367 bzero(b, sizeof (mlxcx_buffer_t)); 1368 b->mlb_mlx = mlxp; 1369 b->mlb_state = MLXCX_BUFFER_INIT; 1370 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t), 1371 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry)); 1372 1373 return (0); 1374 } 1375 1376 static void 1377 mlxcx_bufs_cache_destr(void *arg, void *cookie) 1378 { 1379 mlxcx_t *mlxp = cookie; 1380 mlxcx_buffer_t *b = arg; 1381 VERIFY3P(b->mlb_mlx, ==, mlxp); 1382 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT); 1383 list_destroy(&b->mlb_tx_chain); 1384 } 1385 1386 mlxcx_buf_shard_t * 1387 mlxcx_mlbs_create(mlxcx_t *mlxp) 1388 { 1389 mlxcx_buf_shard_t *s; 1390 1391 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP); 1392 1393 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER, 1394 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1395 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t), 1396 offsetof(mlxcx_buffer_t, mlb_entry)); 1397 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), 1398 offsetof(mlxcx_buffer_t, mlb_entry)); 1399 list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t), 1400 offsetof(mlxcx_buffer_t, mlb_entry)); 1401 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); 1402 1403 list_insert_tail(&mlxp->mlx_buf_shards, s); 1404 1405 return (s); 1406 } 1407 1408 static boolean_t 1409 mlxcx_setup_bufs(mlxcx_t *mlxp) 1410 { 1411 char namebuf[KSTAT_STRLEN]; 1412 1413 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache", 1414 ddi_get_instance(mlxp->mlx_dip)); 1415 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf, 1416 sizeof (mlxcx_buffer_t), sizeof (uint64_t), 1417 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, 1418 NULL, mlxp, NULL, 0); 1419 1420 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), 1421 offsetof(mlxcx_buf_shard_t, mlbs_entry)); 1422 1423 return (B_TRUE); 1424 } 1425 1426 static void 1427 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum, 1428 const char *state, uint8_t statenum) 1429 { 1430 uint64_t ena; 1431 char 
buf[FM_MAX_CLASS]; 1432 1433 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1434 return; 1435 1436 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1437 MLXCX_FM_SERVICE_MLXCX, "qstate.err"); 1438 ena = fm_ena_generate(0, FM_ENA_FMT1); 1439 1440 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1441 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1442 "state", DATA_TYPE_STRING, state, 1443 "state_num", DATA_TYPE_UINT8, statenum, 1444 "qtype", DATA_TYPE_STRING, qtype, 1445 "qnum", DATA_TYPE_UINT32, qnum, 1446 NULL); 1447 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1448 } 1449 1450 /* 1451 * The following set of routines are for monitoring the health of 1452 * event, completion and work queues. They run infrequently peeking at 1453 * the structs to catch stalls and inconsistent state. 1454 * 1455 * They peek at the structs *without* acquiring locks - we don't want 1456 * to impede flow of data. Driver start up and shutdown semantics 1457 * guarantee the structs are present and won't disappear underneath 1458 * these routines. 1459 * 1460 * As previously noted, the routines peek at active data in the structs and 1461 * they will store some values for comparison on next invocation. To 1462 * maintain integrity of the saved values, these values are only modified 1463 * within these routines. 1464 */ 1465 static void 1466 mlxcx_eq_check(void *arg) 1467 { 1468 mlxcx_t *mlxp = (mlxcx_t *)arg; 1469 mlxcx_event_queue_t *eq; 1470 mlxcx_eventq_ctx_t ctx; 1471 const char *str; 1472 1473 uint_t i; 1474 1475 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1476 eq = &mlxp->mlx_eqs[i]; 1477 1478 if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0) 1479 continue; 1480 1481 /* 1482 * If the event queue was successfully created in the HCA, 1483 * then initialization and shutdown sequences guarantee 1484 * the queue exists. 1485 */ 1486 ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED); 1487 1488 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) 1489 continue; 1490 1491 str = "???"; 1492 switch (ctx.mleqc_status) { 1493 case MLXCX_EQ_STATUS_OK: 1494 break; 1495 case MLXCX_EQ_STATUS_WRITE_FAILURE: 1496 str = "WRITE_FAILURE"; 1497 break; 1498 } 1499 1500 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { 1501 mlxcx_fm_qstate_ereport(mlxp, "event", 1502 eq->mleq_num, str, ctx.mleqc_status); 1503 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", 1504 eq->mleq_intr_index, ctx.mleqc_status, str); 1505 } 1506 1507 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && 1508 (eq->mleq_state & MLXCX_EQ_ARMED)) { 1509 if (eq->mleq_cc == eq->mleq_check_disarm_cc && 1510 ++eq->mleq_check_disarm_cnt >= 3) { 1511 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1512 mlxcx_warn(mlxp, "EQ %u isn't armed", 1513 eq->mleq_intr_index); 1514 } 1515 eq->mleq_check_disarm_cc = eq->mleq_cc; 1516 } else { 1517 eq->mleq_check_disarm_cc = 0; 1518 eq->mleq_check_disarm_cnt = 0; 1519 } 1520 } 1521 } 1522 1523 static void 1524 mlxcx_cq_check(void *arg) 1525 { 1526 mlxcx_t *mlxp = (mlxcx_t *)arg; 1527 mlxcx_completion_queue_t *cq; 1528 mlxcx_completionq_ctx_t ctx; 1529 const char *str, *type; 1530 uint_t v; 1531 1532 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; 1533 cq = list_next(&mlxp->mlx_cqs, cq)) { 1534 1535 if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0) 1536 continue; 1537 1538 /* 1539 * If the completion queue was successfully created in the HCA, 1540 * then initialization and shutdown sequences guarantee 1541 * the queue exists. 
1542 */ 1543 ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED); 1544 ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN); 1545 1546 if (cq->mlcq_fm_repd_qstate) 1547 continue; 1548 1549 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) 1550 continue; 1551 1552 if (cq->mlcq_wq != NULL) { 1553 mlxcx_work_queue_t *wq = cq->mlcq_wq; 1554 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ) 1555 type = "rx "; 1556 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) 1557 type = "tx "; 1558 else 1559 type = ""; 1560 } else { 1561 type = ""; 1562 } 1563 1564 str = "???"; 1565 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); 1566 switch (v) { 1567 case MLXCX_CQC_STATUS_OK: 1568 break; 1569 case MLXCX_CQC_STATUS_OVERFLOW: 1570 str = "OVERFLOW"; 1571 break; 1572 case MLXCX_CQC_STATUS_WRITE_FAIL: 1573 str = "WRITE_FAIL"; 1574 break; 1575 case MLXCX_CQC_STATUS_INVALID: 1576 str = "INVALID"; 1577 break; 1578 } 1579 1580 if (v != MLXCX_CQC_STATUS_OK) { 1581 mlxcx_fm_qstate_ereport(mlxp, "completion", 1582 cq->mlcq_num, str, v); 1583 mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)", 1584 type, cq->mlcq_num, v, str); 1585 cq->mlcq_fm_repd_qstate = B_TRUE; 1586 } 1587 1588 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); 1589 if (v != MLXCX_CQC_STATE_ARMED && 1590 (cq->mlcq_state & MLXCX_CQ_ARMED) && 1591 !(cq->mlcq_state & MLXCX_CQ_POLLING)) { 1592 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc && 1593 ++cq->mlcq_check_disarm_cnt >= 3) { 1594 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1595 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed", 1596 type, cq->mlcq_num, cq); 1597 } 1598 cq->mlcq_check_disarm_cc = cq->mlcq_cc; 1599 } else { 1600 cq->mlcq_check_disarm_cnt = 0; 1601 cq->mlcq_check_disarm_cc = 0; 1602 } 1603 } 1604 } 1605 1606 void 1607 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) 1608 { 1609 mlxcx_sq_ctx_t ctx; 1610 mlxcx_sq_state_t state; 1611 1612 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) 1613 return; 1614 1615 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); 1616 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); 1617 switch (state) { 1618 case MLXCX_SQ_STATE_RST: 1619 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1620 mlxcx_fm_qstate_ereport(mlxp, "send", 1621 sq->mlwq_num, "RST", state); 1622 sq->mlwq_fm_repd_qstate = B_TRUE; 1623 } 1624 break; 1625 case MLXCX_SQ_STATE_RDY: 1626 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) { 1627 mlxcx_fm_qstate_ereport(mlxp, "send", 1628 sq->mlwq_num, "RDY", state); 1629 sq->mlwq_fm_repd_qstate = B_TRUE; 1630 } 1631 break; 1632 case MLXCX_SQ_STATE_ERR: 1633 mlxcx_fm_qstate_ereport(mlxp, "send", 1634 sq->mlwq_num, "ERR", state); 1635 sq->mlwq_fm_repd_qstate = B_TRUE; 1636 break; 1637 default: 1638 mlxcx_fm_qstate_ereport(mlxp, "send", 1639 sq->mlwq_num, "???", state); 1640 sq->mlwq_fm_repd_qstate = B_TRUE; 1641 break; 1642 } 1643 } 1644 1645 void 1646 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) 1647 { 1648 mlxcx_rq_ctx_t ctx; 1649 mlxcx_rq_state_t state; 1650 1651 1652 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) 1653 return; 1654 1655 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); 1656 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); 1657 switch (state) { 1658 case MLXCX_RQ_STATE_RST: 1659 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1660 mlxcx_fm_qstate_ereport(mlxp, "receive", 1661 rq->mlwq_num, "RST", state); 1662 rq->mlwq_fm_repd_qstate = B_TRUE; 1663 } 1664 break; 1665 case MLXCX_RQ_STATE_RDY: 1666 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) { 1667 mlxcx_fm_qstate_ereport(mlxp, "receive", 1668 rq->mlwq_num, "RDY", state); 1669 
rq->mlwq_fm_repd_qstate = B_TRUE; 1670 } 1671 break; 1672 case MLXCX_RQ_STATE_ERR: 1673 mlxcx_fm_qstate_ereport(mlxp, "receive", 1674 rq->mlwq_num, "ERR", state); 1675 rq->mlwq_fm_repd_qstate = B_TRUE; 1676 break; 1677 default: 1678 mlxcx_fm_qstate_ereport(mlxp, "receive", 1679 rq->mlwq_num, "???", state); 1680 rq->mlwq_fm_repd_qstate = B_TRUE; 1681 break; 1682 } 1683 } 1684 1685 static void 1686 mlxcx_wq_check(void *arg) 1687 { 1688 mlxcx_t *mlxp = (mlxcx_t *)arg; 1689 mlxcx_work_queue_t *wq; 1690 1691 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL; 1692 wq = list_next(&mlxp->mlx_wqs, wq)) { 1693 1694 if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0) 1695 continue; 1696 1697 /* 1698 * If the work queue was successfully created in the HCA, 1699 * then initialization and shutdown sequences guarantee 1700 * the queue exists. 1701 */ 1702 ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED); 1703 ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN); 1704 1705 if (wq->mlwq_fm_repd_qstate) 1706 continue; 1707 1708 switch (wq->mlwq_type) { 1709 case MLXCX_WQ_TYPE_SENDQ: 1710 mlxcx_check_sq(mlxp, wq); 1711 break; 1712 case MLXCX_WQ_TYPE_RECVQ: 1713 mlxcx_check_rq(mlxp, wq); 1714 break; 1715 } 1716 } 1717 } 1718 1719 static boolean_t 1720 mlxcx_setup_checktimers(mlxcx_t *mlxp) 1721 { 1722 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) { 1723 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp, 1724 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC, 1725 DDI_IPL_0); 1726 } 1727 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) { 1728 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp, 1729 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC, 1730 DDI_IPL_0); 1731 } 1732 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) { 1733 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp, 1734 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC, 1735 DDI_IPL_0); 1736 } 1737 return (B_TRUE); 1738 } 1739 1740 int 1741 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1) 1742 { 1743 const mlxcx_flow_entry_t *left = arg0; 1744 const mlxcx_flow_entry_t *right = arg1; 1745 int bcmpr; 1746 1747 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac, 1748 sizeof (left->mlfe_dmac)); 1749 if (bcmpr < 0) 1750 return (-1); 1751 if (bcmpr > 0) 1752 return (1); 1753 if (left->mlfe_vid < right->mlfe_vid) 1754 return (-1); 1755 if (left->mlfe_vid > right->mlfe_vid) 1756 return (1); 1757 return (0); 1758 } 1759 1760 int 1761 mlxcx_grmac_compare(const void *arg0, const void *arg1) 1762 { 1763 const mlxcx_group_mac_t *left = arg0; 1764 const mlxcx_group_mac_t *right = arg1; 1765 int bcmpr; 1766 1767 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac, 1768 sizeof (left->mlgm_mac)); 1769 if (bcmpr < 0) 1770 return (-1); 1771 if (bcmpr > 0) 1772 return (1); 1773 return (0); 1774 } 1775 1776 int 1777 mlxcx_page_compare(const void *arg0, const void *arg1) 1778 { 1779 const mlxcx_dev_page_t *p0 = arg0; 1780 const mlxcx_dev_page_t *p1 = arg1; 1781 1782 if (p0->mxdp_pa < p1->mxdp_pa) 1783 return (-1); 1784 if (p0->mxdp_pa > p1->mxdp_pa) 1785 return (1); 1786 return (0); 1787 } 1788 1789 static boolean_t 1790 mlxcx_setup_ports(mlxcx_t *mlxp) 1791 { 1792 uint_t i, j; 1793 mlxcx_port_t *p; 1794 mlxcx_flow_table_t *ft; 1795 mlxcx_flow_group_t *fg; 1796 mlxcx_flow_entry_t *fe; 1797 1798 VERIFY3U(mlxp->mlx_nports, >, 0); 1799 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t); 1800 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP); 1801 1802 for (i = 0; i < mlxp->mlx_nports; ++i) { 1803 p = 
&mlxp->mlx_ports[i]; 1804 p->mlp_num = i; 1805 p->mlx_port_event.mla_mlx = mlxp; 1806 p->mlx_port_event.mla_port = p; 1807 mutex_init(&p->mlx_port_event.mla_mtx, NULL, 1808 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri)); 1809 p->mlp_init |= MLXCX_PORT_INIT; 1810 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER, 1811 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1812 mutex_enter(&p->mlp_mtx); 1813 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) { 1814 mutex_exit(&p->mlp_mtx); 1815 goto err; 1816 } 1817 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) { 1818 mutex_exit(&p->mlp_mtx); 1819 goto err; 1820 } 1821 if (!mlxcx_cmd_query_port_status(mlxp, p)) { 1822 mutex_exit(&p->mlp_mtx); 1823 goto err; 1824 } 1825 if (!mlxcx_cmd_query_port_speed(mlxp, p)) { 1826 mutex_exit(&p->mlp_mtx); 1827 goto err; 1828 } 1829 if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p, 1830 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) { 1831 mutex_exit(&p->mlp_mtx); 1832 goto err; 1833 } 1834 if (!mlxcx_cmd_query_port_fec(mlxp, p)) { 1835 mutex_exit(&p->mlp_mtx); 1836 goto err; 1837 } 1838 p->mlp_fec_requested = LINK_FEC_AUTO; 1839 1840 mutex_exit(&p->mlp_mtx); 1841 } 1842 1843 for (i = 0; i < mlxp->mlx_nports; ++i) { 1844 p = &mlxp->mlx_ports[i]; 1845 mutex_enter(&p->mlp_mtx); 1846 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1847 KM_SLEEP)); 1848 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1849 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1850 1851 mutex_enter(&ft->mlft_mtx); 1852 1853 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1854 ft->mlft_port = p; 1855 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift; 1856 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift) 1857 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift; 1858 ft->mlft_nents = (1 << ft->mlft_entshift); 1859 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1860 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1861 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1862 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1863 1864 for (j = 0; j < ft->mlft_nents; ++j) { 1865 ft->mlft_ent[j].mlfe_table = ft; 1866 ft->mlft_ent[j].mlfe_index = j; 1867 } 1868 1869 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1870 mutex_exit(&ft->mlft_mtx); 1871 mutex_exit(&p->mlp_mtx); 1872 goto err; 1873 } 1874 1875 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) { 1876 mutex_exit(&ft->mlft_mtx); 1877 mutex_exit(&p->mlp_mtx); 1878 goto err; 1879 } 1880 1881 /* 1882 * We match broadcast at the top of the root flow table, then 1883 * all multicast/unicast MACs, then the promisc entry is down 1884 * the very bottom. 1885 * 1886 * This way when promisc is on, that entry simply catches any 1887 * remaining traffic that earlier flows haven't matched. 
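		 *
		 * As a rough sketch of the layout built just below (sizes
		 * taken from the flow group allocations that follow):
		 *
		 *   entry 0              broadcast group (1 entry,
		 *                        DMAC ff:ff:ff:ff:ff:ff)
		 *   entries 1 .. N-2     unicast/multicast group
		 *                        (one DMAC match per entry)
		 *   entry N-1            promisc group (1 entry,
		 *                        matches everything)
		 *
		 * where N is mlft_nents for the root table.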
1888 */ 1889 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1890 list_insert_tail(&ft->mlft_groups, fg); 1891 fg->mlfg_table = ft; 1892 fg->mlfg_size = 1; 1893 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1894 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1895 mutex_exit(&ft->mlft_mtx); 1896 mutex_exit(&p->mlp_mtx); 1897 goto err; 1898 } 1899 p->mlp_bcast = fg; 1900 fe = list_head(&fg->mlfg_entries); 1901 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1902 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac)); 1903 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1904 1905 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1906 list_insert_tail(&ft->mlft_groups, fg); 1907 fg->mlfg_table = ft; 1908 fg->mlfg_size = ft->mlft_nents - 2; 1909 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1910 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1911 mutex_exit(&ft->mlft_mtx); 1912 mutex_exit(&p->mlp_mtx); 1913 goto err; 1914 } 1915 p->mlp_umcast = fg; 1916 1917 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1918 list_insert_tail(&ft->mlft_groups, fg); 1919 fg->mlfg_table = ft; 1920 fg->mlfg_size = 1; 1921 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1922 mutex_exit(&ft->mlft_mtx); 1923 mutex_exit(&p->mlp_mtx); 1924 goto err; 1925 } 1926 p->mlp_promisc = fg; 1927 fe = list_head(&fg->mlfg_entries); 1928 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1929 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1930 1931 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare, 1932 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t, 1933 mlfe_dmac_entry)); 1934 1935 mutex_exit(&ft->mlft_mtx); 1936 mutex_exit(&p->mlp_mtx); 1937 } 1938 1939 return (B_TRUE); 1940 1941 err: 1942 mlxcx_teardown_ports(mlxp); 1943 return (B_FALSE); 1944 } 1945 1946 void 1947 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1948 { 1949 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1950 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1951 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1952 mlxcx_flow_entry_t *fe; 1953 mlxcx_group_vlan_t *v; 1954 1955 ASSERT(mutex_owned(&g->mlg_mtx)); 1956 1957 mutex_enter(&ft->mlft_mtx); 1958 1959 if (!list_is_empty(&g->mlg_rx_vlans)) { 1960 fe = list_head(&dfg->mlfg_entries); 1961 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 1962 } 1963 1964 while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) { 1965 fe = v->mlgv_fe; 1966 ASSERT3P(fe->mlfe_table, ==, ft); 1967 ASSERT3P(fe->mlfe_group, ==, fg); 1968 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 1969 1970 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 1971 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 1972 } 1973 1974 mutex_exit(&ft->mlft_mtx); 1975 } 1976 1977 boolean_t 1978 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1979 boolean_t tagged, uint16_t vid) 1980 { 1981 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1982 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1983 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1984 mlxcx_flow_entry_t *fe; 1985 mlxcx_group_vlan_t *v; 1986 boolean_t found = B_FALSE; 1987 1988 ASSERT(mutex_owned(&g->mlg_mtx)); 1989 1990 mutex_enter(&ft->mlft_mtx); 1991 1992 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 1993 v = list_next(&g->mlg_rx_vlans, v)) { 1994 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 1995 found = B_TRUE; 1996 break; 1997 } 1998 } 1999 if (!found) { 2000 mutex_exit(&ft->mlft_mtx); 2001 return (B_FALSE); 2002 } 2003 2004 list_remove(&g->mlg_rx_vlans, v); 2005 2006 /* 2007 * If this is the last VLAN entry, we have to go back to accepting 2008 * any VLAN (which 
means re-enabling the default entry). 2009 * 2010 * Do this before we remove the flow entry for the last specific 2011 * VLAN so that we don't lose any traffic in the transition. 2012 */ 2013 if (list_is_empty(&g->mlg_rx_vlans)) { 2014 fe = list_head(&dfg->mlfg_entries); 2015 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2016 list_insert_tail(&g->mlg_rx_vlans, v); 2017 mutex_exit(&ft->mlft_mtx); 2018 return (B_FALSE); 2019 } 2020 } 2021 2022 fe = v->mlgv_fe; 2023 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED); 2024 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED); 2025 ASSERT3P(fe->mlfe_table, ==, ft); 2026 ASSERT3P(fe->mlfe_group, ==, fg); 2027 2028 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 2029 list_insert_tail(&g->mlg_rx_vlans, v); 2030 fe = list_head(&dfg->mlfg_entries); 2031 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 2032 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2033 } 2034 mutex_exit(&ft->mlft_mtx); 2035 return (B_FALSE); 2036 } 2037 2038 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2039 2040 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2041 2042 mutex_exit(&ft->mlft_mtx); 2043 return (B_TRUE); 2044 } 2045 2046 boolean_t 2047 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged, 2048 uint16_t vid) 2049 { 2050 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 2051 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 2052 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 2053 mlxcx_flow_entry_t *fe; 2054 mlxcx_group_vlan_t *v; 2055 boolean_t found = B_FALSE; 2056 boolean_t first = B_FALSE; 2057 2058 ASSERT(mutex_owned(&g->mlg_mtx)); 2059 2060 mutex_enter(&ft->mlft_mtx); 2061 2062 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 2063 v = list_next(&g->mlg_rx_vlans, v)) { 2064 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 2065 mutex_exit(&ft->mlft_mtx); 2066 return (B_TRUE); 2067 } 2068 } 2069 if (list_is_empty(&g->mlg_rx_vlans)) 2070 first = B_TRUE; 2071 2072 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2073 fe = list_next(&fg->mlfg_entries, fe)) { 2074 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2075 found = B_TRUE; 2076 break; 2077 } 2078 } 2079 if (!found) { 2080 mutex_exit(&ft->mlft_mtx); 2081 return (B_FALSE); 2082 } 2083 2084 v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP); 2085 v->mlgv_fe = fe; 2086 v->mlgv_tagged = tagged; 2087 v->mlgv_vid = vid; 2088 2089 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2090 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2091 fe->mlfe_vid = vid; 2092 if (tagged) { 2093 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN; 2094 } else { 2095 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE; 2096 } 2097 2098 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2099 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2100 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2101 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2102 mutex_exit(&ft->mlft_mtx); 2103 return (B_FALSE); 2104 } 2105 2106 list_insert_tail(&g->mlg_rx_vlans, v); 2107 2108 /* 2109 * If the vlan list was empty for this group before adding this one, 2110 * then we no longer want the "default" entry to allow all VLANs 2111 * through. 
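	 *
	 * Note this mirrors the removal path above, which re-creates the
	 * default entry before deleting the last specific VLAN entry. In
	 * both directions the group is never left with no matching VLAN
	 * entry at all.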
2112 */ 2113 if (first) { 2114 fe = list_head(&dfg->mlfg_entries); 2115 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2116 } 2117 2118 mutex_exit(&ft->mlft_mtx); 2119 return (B_TRUE); 2120 } 2121 2122 void 2123 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port, 2124 mlxcx_ring_group_t *group) 2125 { 2126 mlxcx_flow_entry_t *fe; 2127 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2128 mlxcx_group_mac_t *gm, *ngm; 2129 2130 ASSERT(mutex_owned(&port->mlp_mtx)); 2131 ASSERT(mutex_owned(&group->mlg_mtx)); 2132 2133 mutex_enter(&ft->mlft_mtx); 2134 2135 gm = avl_first(&group->mlg_rx_macs); 2136 for (; gm != NULL; gm = ngm) { 2137 ngm = AVL_NEXT(&group->mlg_rx_macs, gm); 2138 2139 ASSERT3P(gm->mlgm_group, ==, group); 2140 fe = gm->mlgm_fe; 2141 ASSERT3P(fe->mlfe_table, ==, ft); 2142 2143 avl_remove(&group->mlg_rx_macs, gm); 2144 list_remove(&fe->mlfe_ring_groups, gm); 2145 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2146 2147 fe->mlfe_ndest = 0; 2148 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2149 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2150 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2151 gm->mlgm_group->mlg_rx_vlan_ft; 2152 } 2153 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2154 2155 if (fe->mlfe_ndest > 0) { 2156 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 2157 continue; 2158 } 2159 2160 /* 2161 * There are no more ring groups left for this MAC (it wasn't 2162 * attached to any other groups since ndest == 0), so clean up 2163 * its flow entry. 2164 */ 2165 avl_remove(&port->mlp_dmac_fe, fe); 2166 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2167 list_destroy(&fe->mlfe_ring_groups); 2168 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2169 } 2170 2171 mutex_exit(&ft->mlft_mtx); 2172 } 2173 2174 boolean_t 2175 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2176 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2177 { 2178 mlxcx_flow_entry_t *fe; 2179 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2180 mlxcx_group_mac_t *gm, probe; 2181 2182 ASSERT(mutex_owned(&port->mlp_mtx)); 2183 ASSERT(mutex_owned(&group->mlg_mtx)); 2184 2185 bzero(&probe, sizeof (probe)); 2186 bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac)); 2187 2188 mutex_enter(&ft->mlft_mtx); 2189 2190 gm = avl_find(&group->mlg_rx_macs, &probe, NULL); 2191 if (gm == NULL) { 2192 mutex_exit(&ft->mlft_mtx); 2193 return (B_FALSE); 2194 } 2195 ASSERT3P(gm->mlgm_group, ==, group); 2196 ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac))); 2197 2198 fe = gm->mlgm_fe; 2199 ASSERT3P(fe->mlfe_table, ==, ft); 2200 ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac))); 2201 2202 list_remove(&fe->mlfe_ring_groups, gm); 2203 avl_remove(&group->mlg_rx_macs, gm); 2204 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2205 2206 fe->mlfe_ndest = 0; 2207 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2208 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2209 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2210 gm->mlgm_group->mlg_rx_vlan_ft; 2211 } 2212 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2213 2214 if (fe->mlfe_ndest > 0) { 2215 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2216 mutex_exit(&ft->mlft_mtx); 2217 return (B_FALSE); 2218 } 2219 mutex_exit(&ft->mlft_mtx); 2220 return (B_TRUE); 2221 } 2222 2223 /* 2224 * There are no more ring groups left for this MAC (it wasn't attached 2225 * to any other groups since ndest == 0), so clean up its flow entry. 
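	 *
	 * Clearing MLXCX_FLOW_ENTRY_RESERVED below returns this entry to
	 * the pool that mlxcx_add_umcast_entry() searches when it needs a
	 * free slot in the port's unicast/multicast flow group.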
2226 */ 2227 avl_remove(&port->mlp_dmac_fe, fe); 2228 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2229 list_destroy(&fe->mlfe_ring_groups); 2230 2231 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2232 2233 mutex_exit(&ft->mlft_mtx); 2234 2235 return (B_TRUE); 2236 } 2237 2238 boolean_t 2239 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2240 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2241 { 2242 mlxcx_flow_group_t *fg; 2243 mlxcx_flow_entry_t *fe, probe; 2244 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2245 mlxcx_group_mac_t *gm; 2246 boolean_t found = B_FALSE; 2247 2248 ASSERT(mutex_owned(&port->mlp_mtx)); 2249 ASSERT(mutex_owned(&group->mlg_mtx)); 2250 2251 bzero(&probe, sizeof (probe)); 2252 bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac)); 2253 2254 mutex_enter(&ft->mlft_mtx); 2255 2256 fe = avl_find(&port->mlp_dmac_fe, &probe, NULL); 2257 2258 if (fe == NULL) { 2259 fg = port->mlp_umcast; 2260 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2261 fe = list_next(&fg->mlfg_entries, fe)) { 2262 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2263 found = B_TRUE; 2264 break; 2265 } 2266 } 2267 if (!found) { 2268 mutex_exit(&ft->mlft_mtx); 2269 return (B_FALSE); 2270 } 2271 list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t), 2272 offsetof(mlxcx_group_mac_t, mlgm_fe_entry)); 2273 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2274 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 2275 bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)); 2276 2277 avl_add(&port->mlp_dmac_fe, fe); 2278 } 2279 2280 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft; 2281 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2282 2283 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2284 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2285 if (--fe->mlfe_ndest == 0) { 2286 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2287 } 2288 mutex_exit(&ft->mlft_mtx); 2289 return (B_FALSE); 2290 } 2291 2292 gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP); 2293 gm->mlgm_group = group; 2294 gm->mlgm_fe = fe; 2295 bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)); 2296 avl_add(&group->mlg_rx_macs, gm); 2297 list_insert_tail(&fe->mlfe_ring_groups, gm); 2298 2299 mutex_exit(&ft->mlft_mtx); 2300 2301 return (B_TRUE); 2302 } 2303 2304 boolean_t 2305 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft, 2306 mlxcx_flow_group_t *fg) 2307 { 2308 mlxcx_flow_entry_t *fe; 2309 uint_t i, idx; 2310 2311 ASSERT(mutex_owned(&ft->mlft_mtx)); 2312 ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED); 2313 ASSERT3P(fg->mlfg_table, ==, ft); 2314 2315 if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents) 2316 return (B_FALSE); 2317 fg->mlfg_start_idx = ft->mlft_next_ent; 2318 2319 if (!mlxcx_cmd_create_flow_group(mlxp, fg)) { 2320 return (B_FALSE); 2321 } 2322 2323 list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t), 2324 offsetof(mlxcx_flow_entry_t, mlfe_group_entry)); 2325 for (i = 0; i < fg->mlfg_size; ++i) { 2326 idx = fg->mlfg_start_idx + i; 2327 fe = &ft->mlft_ent[idx]; 2328 fe->mlfe_group = fg; 2329 list_insert_tail(&fg->mlfg_entries, fe); 2330 } 2331 fg->mlfg_avail = fg->mlfg_size; 2332 ft->mlft_next_ent += fg->mlfg_size; 2333 2334 return (B_TRUE); 2335 } 2336 2337 static boolean_t 2338 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events) 2339 { 2340 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec]; 2341 2342 mutex_enter(&mleq->mleq_mtx); 2343 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2344 /* mlxcx_teardown_eqs() will clean this up */ 2345 
mutex_exit(&mleq->mleq_mtx); 2346 return (B_FALSE); 2347 } 2348 mleq->mleq_mlx = mlxp; 2349 mleq->mleq_uar = &mlxp->mlx_uar; 2350 mleq->mleq_events = events; 2351 mleq->mleq_intr_index = vec; 2352 2353 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2354 /* mlxcx_teardown_eqs() will clean this up */ 2355 mutex_exit(&mleq->mleq_mtx); 2356 return (B_FALSE); 2357 } 2358 2359 if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) { 2360 /* 2361 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and 2362 * eq_rele_dma 2363 */ 2364 mutex_exit(&mleq->mleq_mtx); 2365 return (B_FALSE); 2366 } 2367 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2368 mlxcx_arm_eq(mlxp, mleq); 2369 mutex_exit(&mleq->mleq_mtx); 2370 2371 return (B_TRUE); 2372 } 2373 2374 static boolean_t 2375 mlxcx_setup_async_eqs(mlxcx_t *mlxp) 2376 { 2377 boolean_t ret; 2378 2379 ret = mlxcx_setup_eq(mlxp, 0, 2380 (1ULL << MLXCX_EVENT_CMD_COMPLETION) | 2381 (1ULL << MLXCX_EVENT_PAGE_REQUEST) | 2382 (1ULL << MLXCX_EVENT_PORT_STATE) | 2383 (1ULL << MLXCX_EVENT_INTERNAL_ERROR) | 2384 (1ULL << MLXCX_EVENT_PORT_MODULE) | 2385 (1ULL << MLXCX_EVENT_SENDQ_DRAIN) | 2386 (1ULL << MLXCX_EVENT_LAST_WQE) | 2387 (1ULL << MLXCX_EVENT_CQ_ERROR) | 2388 (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) | 2389 (1ULL << MLXCX_EVENT_PAGE_FAULT) | 2390 (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) | 2391 (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) | 2392 (1ULL << MLXCX_EVENT_NIC_VPORT) | 2393 (1ULL << MLXCX_EVENT_DOORBELL_CONGEST)); 2394 2395 if (ret) 2396 mlxcx_cmd_eq_enable(mlxp); 2397 2398 return (ret); 2399 } 2400 2401 int 2402 mlxcx_cq_compare(const void *arg0, const void *arg1) 2403 { 2404 const mlxcx_completion_queue_t *left = arg0; 2405 const mlxcx_completion_queue_t *right = arg1; 2406 2407 if (left->mlcq_num < right->mlcq_num) { 2408 return (-1); 2409 } 2410 if (left->mlcq_num > right->mlcq_num) { 2411 return (1); 2412 } 2413 return (0); 2414 } 2415 2416 static boolean_t 2417 mlxcx_setup_eqs(mlxcx_t *mlxp) 2418 { 2419 uint_t i; 2420 mlxcx_event_queue_t *mleq; 2421 2422 ASSERT3S(mlxp->mlx_intr_count, >, 0); 2423 2424 for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) { 2425 mleq = &mlxp->mlx_eqs[i]; 2426 mutex_enter(&mleq->mleq_mtx); 2427 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2428 mutex_exit(&mleq->mleq_mtx); 2429 return (B_FALSE); 2430 } 2431 mleq->mleq_uar = &mlxp->mlx_uar; 2432 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2433 /* mlxcx_teardown() will handle calling eq_rele_dma */ 2434 mutex_exit(&mleq->mleq_mtx); 2435 return (B_FALSE); 2436 } 2437 if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 && 2438 !mlxcx_cmd_set_int_mod(mlxp, i, 2439 mlxp->mlx_props.mldp_intrmod_period_usec)) { 2440 mutex_exit(&mleq->mleq_mtx); 2441 return (B_FALSE); 2442 } 2443 if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) { 2444 mutex_exit(&mleq->mleq_mtx); 2445 return (B_FALSE); 2446 } 2447 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2448 mlxcx_arm_eq(mlxp, mleq); 2449 mutex_exit(&mleq->mleq_mtx); 2450 } 2451 2452 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0; 2453 2454 return (B_TRUE); 2455 } 2456 2457 /* 2458 * Snapshot all of the hardware capabilities that we care about and then modify 2459 * the HCA capabilities to get things moving. 
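 *
 * Note that a failed query below only produces a warning: it is the checks
 * on the snapshotted values further down (page size, CQE version, port
 * type, flow table support) that decide whether we can continue with
 * attach.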
2460 */ 2461 static boolean_t 2462 mlxcx_init_caps(mlxcx_t *mlxp) 2463 { 2464 mlxcx_caps_t *c; 2465 2466 mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP); 2467 2468 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2469 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) { 2470 mlxcx_warn(mlxp, "failed to obtain current HCA general caps"); 2471 } 2472 2473 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2474 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) { 2475 mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps"); 2476 } 2477 2478 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2479 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) { 2480 mlxcx_warn(mlxp, "failed to obtain current HCA eth caps"); 2481 } 2482 2483 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2484 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) { 2485 mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps"); 2486 } 2487 2488 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2489 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) { 2490 mlxcx_warn(mlxp, "failed to obtain current HCA flow caps"); 2491 } 2492 2493 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2494 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) { 2495 mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps"); 2496 } 2497 2498 /* 2499 * Check the caps meet our requirements. 2500 */ 2501 const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general; 2502 2503 if (gen->mlcap_general_log_pg_sz != 12) { 2504 mlxcx_warn(mlxp, "!hardware has page size != 4k " 2505 "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz); 2506 goto err; 2507 } 2508 if (gen->mlcap_general_cqe_version != 1) { 2509 mlxcx_warn(mlxp, "!hardware does not support CQE v1 " 2510 "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version); 2511 goto err; 2512 } 2513 if (gen->mlcap_general_port_type != 2514 MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) { 2515 mlxcx_warn(mlxp, "!hardware has non-ethernet ports"); 2516 goto err; 2517 } 2518 mlxp->mlx_nports = gen->mlcap_general_num_ports; 2519 mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F)); 2520 2521 c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir); 2522 2523 c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2524 MLXCX_ETH_CAP_CSUM_CAP); 2525 c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2526 MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN); 2527 2528 c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2529 mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP)); 2530 if (c->mlc_max_lso_size == 1) { 2531 c->mlc_max_lso_size = 0; 2532 c->mlc_lso = B_FALSE; 2533 } else { 2534 c->mlc_lso = B_TRUE; 2535 } 2536 2537 c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2538 mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP)); 2539 2540 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2541 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) { 2542 mlxcx_warn(mlxp, "!hardware does not support rx flow tables"); 2543 goto err; 2544 } 2545 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2546 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) { 2547 mlxcx_warn(mlxp, "!hardware does not support modifying rx " 2548 "flow table entries"); 2549 goto err; 2550 } 2551 2552 c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2553 mlcap_flow_prop_log_max_ft_size; 2554 c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow. 
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);

	return (B_TRUE);

err:
	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
	return (B_FALSE);
}

static int
mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	mlxcx_t *mlxp;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	mlxp = ddi_get_driver_private(dip);
	if (mlxp == NULL) {
		mlxcx_warn(NULL, "asked to detach, but missing instance "
		    "private data");
		return (DDI_FAILURE);
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
			return (DDI_FAILURE);
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
	}

	mlxcx_teardown(mlxp);
	return (DDI_SUCCESS);
}

static size_t
mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
{
	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
	    mlxp->mlx_props.mldp_rx_ngroups_small;
	size_t tirlim, flowlim, gflowlim;

	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
	if (tirlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on number of TIRs available", tirlim);
		ngroups = tirlim;
	}

	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
	if (flowlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on max size of RX flow tables", flowlim);
		ngroups = flowlim;
	}

	/*
	 * Restrict the number of groups so that we do not exceed the
	 * maximum number of flow tables the device's capabilities allow
	 * for. There is one root table entry per port and two entries
	 * per group.
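	 *
	 * As a worked example with hypothetical numbers: a device
	 * advertising 256 flow tables on a 2-port card would clamp us
	 * to (256 - 2) / 2 = 127 groups here.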
	 */
	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
	if (flowlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on max number of RX flow tables",
		    flowlim);
		ngroups = flowlim;
	}

	do {
		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
		if (gflowlim < ngroups) {
			mlxcx_note(mlxp, "limiting number of rx groups to %u "
			    "based on max total RX flows", gflowlim);
			--ngroups;
		}
	} while (gflowlim < ngroups);

	return (ngroups);
}

static int
mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	mlxcx_t *mlxp;
	char tq_name[TASKQ_NAMELEN];
	uint_t i;
	int inst, ret;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	inst = ddi_get_instance(dip);
	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
	if (ret != 0)
		return (ret);

	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
	if (mlxp == NULL)
		return (DDI_FAILURE);
	mlxp->mlx_dip = dip;
	mlxp->mlx_inst = inst;
	ddi_set_driver_private(dip, mlxp);

	mlxcx_load_props(mlxp);

	mlxcx_fm_init(mlxp);
	mlxp->mlx_attach |= MLXCX_ATTACH_FM;

	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to initialize PCI config space");
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;

	if (!mlxcx_regs_map(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;

	if (!mlxcx_cmd_queue_init(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;

	if (!mlxcx_cmd_enable_hca(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;

	if (!mlxcx_check_issi(mlxp)) {
		goto err;
	}

	/*
	 * We have to get our interrupts now so we know what priority to
	 * create pagemtx with.
	 */
	if (!mlxcx_intr_setup(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;

	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;

	/*
	 * Taskq for asynchronous events which may interact with the HCA
	 * via the command interface. Single threaded FIFO.
	 */
	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
	    TASKQ_PREPOPULATE);
	/*
	 * Initialize any pre-allocated taskq param structs.
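	 *
	 * These are pre-allocated, one per function ID, presumably so
	 * that the async event handlers always have a ready-made
	 * argument to hand to the taskq rather than allocating one at
	 * event time.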
2720 */ 2721 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 2722 mlxp->mlx_npages_req[i].mla_mlx = mlxp; 2723 mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL, 2724 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri)); 2725 } 2726 mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ; 2727 2728 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) { 2729 goto err; 2730 } 2731 2732 if (!mlxcx_init_caps(mlxp)) { 2733 goto err; 2734 } 2735 mlxp->mlx_attach |= MLXCX_ATTACH_CAPS; 2736 2737 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) { 2738 goto err; 2739 } 2740 2741 if (!mlxcx_cmd_init_hca(mlxp)) { 2742 goto err; 2743 } 2744 mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA; 2745 2746 if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) { 2747 goto err; 2748 } 2749 2750 /* 2751 * The User Access Region (UAR) is needed so we can ring EQ and CQ 2752 * doorbells. 2753 */ 2754 if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) { 2755 goto err; 2756 } 2757 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) { 2758 mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL, 2759 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2760 } 2761 mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD; 2762 2763 /* 2764 * Set up asynchronous event queue which handles control type events 2765 * like PAGE_REQUEST and CMD completion events. 2766 * 2767 * This will enable and arm the interrupt on EQ 0. 2768 */ 2769 if (!mlxcx_setup_async_eqs(mlxp)) { 2770 goto err; 2771 } 2772 2773 /* 2774 * Allocate a protection and transport domain. These don't really do 2775 * anything for us (they're IB concepts), but we need to give their 2776 * ID numbers in other commands. 2777 */ 2778 if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) { 2779 goto err; 2780 } 2781 if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) { 2782 goto err; 2783 } 2784 /* 2785 * Fetch the "reserved" lkey that lets us give linear addresses in 2786 * work queue entries, rather than having to mess with the NIC's 2787 * internal MMU. 2788 */ 2789 if (!mlxcx_cmd_query_special_ctxs(mlxp)) { 2790 goto err; 2791 } 2792 2793 /* 2794 * Query our port information and current state, populate the 2795 * mlxcx_port_t structs. 2796 * 2797 * This also sets up the root flow tables and flow groups. 2798 */ 2799 if (!mlxcx_setup_ports(mlxp)) { 2800 goto err; 2801 } 2802 mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; 2803 2804 mlxcx_load_model_props(mlxp); 2805 2806 /* 2807 * Set up, enable and arm the rest of the interrupt EQs which will 2808 * service events from CQs. 2809 * 2810 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be 2811 * cleaned up. 2812 */ 2813 if (!mlxcx_setup_eqs(mlxp)) { 2814 goto err; 2815 } 2816 2817 /* Completion queues */ 2818 list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t), 2819 offsetof(mlxcx_completion_queue_t, mlcq_entry)); 2820 mlxp->mlx_attach |= MLXCX_ATTACH_CQS; 2821 2822 /* Work queues (send queues, receive queues) */ 2823 list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t), 2824 offsetof(mlxcx_work_queue_t, mlwq_entry)); 2825 mlxp->mlx_attach |= MLXCX_ATTACH_WQS; 2826 2827 /* 2828 * Construct our arrays of mlxcx_ring_group_ts, which represent the 2829 * "groups" we advertise to MAC. 
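	 *
	 * Only the software state is allocated here; the hardware
	 * objects behind each group are created later, by the
	 * mlxcx_rx_group_setup() and mlxcx_tx_group_setup() calls below.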
	 */
	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);

	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);

	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;

	/*
	 * Set up the free/busy buffer lists for keeping track of packet
	 * buffers.
	 */
	if (!mlxcx_setup_bufs(mlxp))
		goto err;
	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;

	/*
	 * Before we tell MAC about our rings/groups, we need to do enough
	 * setup on them to be sure about the numbers and configuration that
	 * we have. This will do basically everything short of allocating
	 * packet buffers and starting the rings up.
	 */
	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
			goto err;
	}
	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
			goto err;
	}

	/*
	 * Set up the periodic fault check timers which check the queue
	 * states. This must happen after all of the queues have been
	 * initialized; consequently, the timers must be torn down before
	 * queue teardown.
	 */
	if (!mlxcx_setup_checktimers(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;

	/*
	 * Some devices may not have a working temperature sensor; however,
	 * there isn't a great way for us to know. We shouldn't fail attach if
	 * this doesn't work.
	 */
	if (mlxcx_setup_sensors(mlxp)) {
		mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS;
	}

	/*
	 * Finally, tell MAC that we exist!
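	 *
	 * As a minimal sketch of what registering with the MAC framework
	 * involves (the real sequence lives in mlxcx_register_mac(); the
	 * callback structure name here is an assumption):
	 *
	 *	mac_register_t *mac = mac_alloc(MAC_VERSION);
	 *	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	 *	mac->m_driver = mlxp;
	 *	mac->m_dip = mlxp->mlx_dip;
	 *	mac->m_callbacks = &mlxcx_mac_callbacks;
	 *	ret = mac_register(mac, &mlxp->mlx_mac_hdl);
	 *	mac_free(mac);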
2888 */ 2889 if (!mlxcx_register_mac(mlxp)) { 2890 goto err; 2891 } 2892 mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL; 2893 2894 return (DDI_SUCCESS); 2895 2896 err: 2897 mlxcx_teardown(mlxp); 2898 return (DDI_FAILURE); 2899 } 2900 2901 static struct cb_ops mlxcx_cb_ops = { 2902 .cb_open = nulldev, 2903 .cb_close = nulldev, 2904 .cb_strategy = nodev, 2905 .cb_print = nodev, 2906 .cb_dump = nodev, 2907 .cb_read = nodev, 2908 .cb_write = nodev, 2909 .cb_ioctl = nodev, 2910 .cb_devmap = nodev, 2911 .cb_mmap = nodev, 2912 .cb_segmap = nodev, 2913 .cb_chpoll = nochpoll, 2914 .cb_prop_op = ddi_prop_op, 2915 .cb_flag = D_MP, 2916 .cb_rev = CB_REV, 2917 .cb_aread = nodev, 2918 .cb_awrite = nodev 2919 }; 2920 2921 static struct dev_ops mlxcx_dev_ops = { 2922 .devo_rev = DEVO_REV, 2923 .devo_refcnt = 0, 2924 .devo_getinfo = NULL, 2925 .devo_identify = nulldev, 2926 .devo_probe = nulldev, 2927 .devo_attach = mlxcx_attach, 2928 .devo_detach = mlxcx_detach, 2929 .devo_reset = nodev, 2930 .devo_quiesce = ddi_quiesce_not_supported, 2931 .devo_cb_ops = &mlxcx_cb_ops 2932 }; 2933 2934 static struct modldrv mlxcx_modldrv = { 2935 .drv_modops = &mod_driverops, 2936 .drv_linkinfo = "Mellanox Connect-X 4/5/6", 2937 .drv_dev_ops = &mlxcx_dev_ops 2938 }; 2939 2940 static struct modlinkage mlxcx_modlinkage = { 2941 .ml_rev = MODREV_1, 2942 .ml_linkage = { &mlxcx_modldrv, NULL } 2943 }; 2944 2945 int 2946 _init(void) 2947 { 2948 int ret; 2949 2950 ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0); 2951 if (ret != 0) { 2952 return (ret); 2953 } 2954 2955 mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME); 2956 2957 if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) { 2958 mac_fini_ops(&mlxcx_dev_ops); 2959 ddi_soft_state_fini(&mlxcx_softstate); 2960 return (ret); 2961 } 2962 2963 return (DDI_SUCCESS); 2964 } 2965 2966 int 2967 _info(struct modinfo *modinfop) 2968 { 2969 return (mod_info(&mlxcx_modlinkage, modinfop)); 2970 } 2971 2972 int 2973 _fini(void) 2974 { 2975 int ret; 2976 2977 if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) { 2978 return (ret); 2979 } 2980 2981 mac_fini_ops(&mlxcx_dev_ops); 2982 2983 ddi_soft_state_fini(&mlxcx_softstate); 2984 2985 return (DDI_SUCCESS); 2986 } 2987