/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#ifndef _SYS_IB_CLIENTS_IBD_H
#define	_SYS_IB_CLIENTS_IBD_H

#ifdef __cplusplus
extern "C" {
#endif

/* The following macros are used in both ibd.c and ibd_cm.c */

/*
 * Completion queue polling control
 */
#define	IBD_CQ_POLLING			0x1
#define	IBD_REDO_CQ_POLLING		0x2

/*
 * Maximum length for returning chained mps back to crossbow.
 * Also used as the maximum number of rx wc's polled at a time.
 */
#define	IBD_MAX_RX_MP_LEN		16

/*
 * When doing multiple-send-wr, this value determines how many to do at
 * a time (in a single ibt_post_send).
 */
#define	IBD_MAX_TX_POST_MULTIPLE	4

/*
 * Flag bits for resources to reap
 */
#define	IBD_RSRC_SWQE			0x1
#define	IBD_RSRC_LSOBUF			0x2
#define	IBD_RSRC_RC_SWQE		0x4
#define	IBD_RSRC_RC_TX_LARGEBUF		0x8

/*
 * Async operation types
 */
#define	IBD_ASYNC_GETAH			1
#define	IBD_ASYNC_JOIN			2
#define	IBD_ASYNC_LEAVE			3
#define	IBD_ASYNC_PROMON		4
#define	IBD_ASYNC_PROMOFF		5
#define	IBD_ASYNC_REAP			6
#define	IBD_ASYNC_TRAP			7
#define	IBD_ASYNC_SCHED			8
#define	IBD_ASYNC_LINK			9
#define	IBD_ASYNC_EXIT			10
#define	IBD_ASYNC_RC_TOO_BIG		11
#define	IBD_ASYNC_RC_CLOSE_ACT_CHAN	12
#define	IBD_ASYNC_RC_RECYCLE_ACE	13

/*
 * Miscellaneous constants
 */
#define	IBD_SEND			0
#define	IBD_RECV			1

/* Tunable defaults and limits */
#define	IBD_LINK_MODE_UD		0
#define	IBD_LINK_MODE_RC		1

#define	IBD_DEF_LINK_MODE		IBD_LINK_MODE_RC
#define	IBD_DEF_LSO_POLICY		B_TRUE
#define	IBD_DEF_NUM_LSO_BUFS		1024
#define	IBD_DEF_CREATE_BCAST_GROUP	B_TRUE
#define	IBD_DEF_COALESCE_COMPLETIONS	B_TRUE
#define	IBD_DEF_UD_RX_COMP_COUNT	4
#define	IBD_DEF_UD_RX_COMP_USEC		10
#define	IBD_DEF_UD_TX_COMP_COUNT	16
#define	IBD_DEF_UD_TX_COMP_USEC		300
#define	IBD_DEF_RC_RX_COMP_COUNT	4
#define	IBD_DEF_RC_RX_COMP_USEC		10
#define	IBD_DEF_RC_TX_COMP_COUNT	10
#define	IBD_DEF_RC_TX_COMP_USEC		300
#define	IBD_DEF_UD_TX_COPY_THRESH	4096
#define	IBD_DEF_RC_RX_COPY_THRESH	4096
#define	IBD_DEF_RC_TX_COPY_THRESH	4096
#define	IBD_DEF_UD_NUM_RWQE		4000
#define	IBD_DEF_UD_NUM_SWQE		4000
#define	IBD_DEF_RC_ENABLE_SRQ		B_TRUE
#define	IBD_DEF_RC_NUM_RWQE		2047
#define	IBD_DEF_RC_NUM_SWQE		511
#define	IBD_DEF_NUM_AH			256
#define	IBD_DEF_HASH_SIZE		32
#define	IBD_DEF_RC_NUM_SRQ		(IBD_DEF_RC_NUM_RWQE - 1)
#define	IBD_DEF_RC_RX_RWQE_THRESH	(IBD_DEF_RC_NUM_RWQE >> 2)

/* Tunable limits */
#define	IBD_MIN_NUM_LSO_BUFS		512
#define	IBD_MAX_NUM_LSO_BUFS		4096
#define	IBD_MIN_UD_TX_COPY_THRESH	2048
#define	IBD_MAX_UD_TX_COPY_THRESH	65536
#define	IBD_MIN_UD_NUM_SWQE		512
#define	IBD_MAX_UD_NUM_SWQE		8000
#define	IBD_MIN_UD_NUM_RWQE		512
#define	IBD_MAX_UD_NUM_RWQE		8000
#define	IBD_MIN_NUM_AH			32
#define	IBD_MAX_NUM_AH			8192
#define	IBD_MIN_HASH_SIZE		32
#define	IBD_MAX_HASH_SIZE		1024

#define	IBD_MIN_RC_NUM_SWQE		511
#define	IBD_MAX_RC_NUM_SWQE		8000
#define	IBD_MIN_RC_NUM_RWQE		511
#define	IBD_MAX_RC_NUM_RWQE		8000
#define	IBD_MIN_RC_RX_COPY_THRESH	1500
#define	IBD_MAX_RC_RX_COPY_THRESH	65520
#define	IBD_MIN_RC_TX_COPY_THRESH	1500
#define	IBD_MAX_RC_TX_COPY_THRESH	65520
#define	IBD_MIN_RC_NUM_SRQ		(IBD_MIN_RC_NUM_RWQE - 1)
#define	IBD_MIN_RC_RX_RWQE_THRESH	(IBD_MIN_RC_NUM_RWQE >> 2)

/*
 * Thresholds
 *
 * When waiting for resources (swqes or lso buffers) to become available,
 * the first two thresholds below determine how long to wait before informing
 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
 * determines how low the available swqes should go before we start polling
 * the completion queue.
 */
#define	IBD_FREE_LSOS_THRESH		8
#define	IBD_FREE_SWQES_THRESH		20
#define	IBD_TX_POLL_THRESH		80

#ifdef DEBUG
void debug_print(int l, char *fmt, ...);
#define	DPRINT		debug_print
#else
#define	DPRINT		0 &&
#endif

/*
 * AH and MCE active list manipulation:
 *
 * Multicast disable requests and MCG delete traps are two cases
 * where the active AH entry for the mcg (if any unreferenced one exists)
 * will be moved to the free list (to force the next Tx to the mcg to
 * join the MCG in SendOnly mode). Port up handling will also move AHs
 * from the active to the free list.
 *
 * In the case when some transmits are still pending on an entry
 * for an mcg, but a multicast disable has already been issued on the
 * mcg, there are some options to consider to preserve the join state
 * to ensure the emitted packet is properly routed on the IBA fabric.
 * For the AH, we can
 * 1. take it out of the active list at multicast disable time.
 * 2. take it out of the active list only when the last pending Tx completes.
 * For the MCE, we can
 * 3. take it out of the active list at multicast disable time.
 * 4. take it out of the active list only when the last pending Tx completes.
 * 5. move it from the active list to a stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (i.e., the mcg leave done) when the pending Tx does complete.
 * Alternatively, a stale list (#5) that would be checked in the enable
 * code would need to be implemented. Option 2 is used because otherwise
 * a Tx attempt after the multicast disable would try to put an AH in the
 * active list, and associate the mce it finds in the active list with
 * this new AH, whereas the mce is already associated with the previous
 * AH (taken off the active list), and will be removed once the pending
 * Tx's complete (unless a reference count on mce's is implemented). One
 * implication of using 2,4 is that new Tx's posted before the pending
 * Tx's complete will grab new references on the AH, further delaying
 * the leave.
 *
 * In the case of an mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes through
 * reincarnations).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it cannot possibly receive an mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but cannot distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or due to some fabric/hca issues, because of the delayed
 * nature of trap delivery. Querying the SA to establish presence/absence
 * of the mcg is also racy at best. Thus, the driver just prints a warning
 * message when it cannot rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */

/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that the maximum
 * reference count on any handle is limited by the number of send
 * wqes, thus high bits in the ac_ref field can be used as the recycle
 * bits, and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, num);		\
}
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		CLEAR_REFCYCLE(ace), B_FALSE :		\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?	\
	/*						\
	 * Ref count known to be 0 from above.		\
	 */						\
	B_TRUE :					\
	B_FALSE						\
)
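
/*
 * Illustrative sketch only, guarded out of compilation: one way the Tx
 * path and the Tx completion handler could pair up on the macros above.
 * The helper name and the guard macro IBD_EXAMPLE_SKETCHES are
 * hypothetical, and ibd_ace_t is only defined later in this header;
 * the real logic lives in ibd.c.
 */
#ifdef IBD_EXAMPLE_SKETCHES	/* never defined; examples only */
static void
ibd_example_ah_refcnt(ibd_ace_t *ace)
{
	INC_REF(ace, 1);	/* Tx path: account for one pending send */
	/* ... post the send that references this address handle ... */
	if (DEC_REF_DO_CYCLE(ace)) {
		/*
		 * Completion handler: the ref count dropped to 0 with
		 * the recycle bit set, so this entry may be recycled.
		 */
	}
}
#endif	/* IBD_EXAMPLE_SKETCHES */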

/*
 * Address handle entries maintained by the driver are kept in the
 * free and active lists. Each entry starts out in the free list;
 * it migrates to the active list when primed using ibt_get_paths()
 * and ibt_modify_ud_dest() for transmission to a specific destination.
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (i.e., the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are done atomically
 * (increments by the Tx path, decrements by the Tx callback handler).
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
	state->id_ac_hot_ace = ce;				\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	if (state->id_ac_hot_ace == ce)				\
		state->id_ac_hot_ace = NULL;			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)

/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts the padding by default at the end. The routine that does this
 * is nce_xmit() in ip_ndp.c. It copies the nd_lla_addr after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the
 * IBD driver, it is in the following format:
 * [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * The nd_opt_hdr_t is 2 bytes, followed by the 22-byte ipoib_machdr. As
 * a result machdr is not 4-byte aligned and has 2 bytes of padding at
 * the end.
 *
 * The send routine at the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in machdr being
 * 4-byte aligned.
 *
 * At the receiving side, ibd_process_rx takes the above packet and
 * removes the two bytes of front padding and inserts them at the end,
 * since the IP layer does not understand padding at the front.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {				\
	uchar_t		*nd_lla_ptr;				\
	icmp6_t		*icmp6;					\
	nd_opt_hdr_t	*opt;					\
	int		i;					\
								\
	icmp6 = (icmp6_t *)&ip6h[1];				\
	len -= sizeof (nd_neighbor_advert_t);			\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||	\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&	\
	    (len != 0)) {					\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
		ASSERT(opt != NULL);				\
		nd_lla_ptr = (uchar_t *)&opt[1];		\
		if (type == IBD_SEND) {				\
			for (i = IPOIB_ADDRL; i > 0; i--)	\
				*(nd_lla_ptr + i + 1) =		\
				    *(nd_lla_ptr + i - 1);	\
		} else {					\
			for (i = 0; i < IPOIB_ADDRL; i++)	\
				*(nd_lla_ptr + i) =		\
				    *(nd_lla_ptr + i + 2);	\
		}						\
		*(nd_lla_ptr + i) = 0;				\
		*(nd_lla_ptr + i + 1) = 0;			\
	}							\
}
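
/*
 * Illustrative sketch only, guarded out of compilation: how the pad
 * shuffle above might be invoked on the send side for an ICMPv6 ND
 * packet. The helper, the surrounding checks, and the availability of
 * ip6_t/IPPROTO_ICMPV6/ntohs() are assumptions; the real call sites
 * are in ibd.c's send and receive paths.
 */
#ifdef IBD_EXAMPLE_SKETCHES	/* never defined; examples only */
static void
ibd_example_pad_nsna(ip6_t *ip6h)
{
	uint_t len;

	if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
		len = ntohs(ip6h->ip6_plen);	/* ICMPv6 payload length */
		/* moves the 2 trailing pad bytes in front of the LLA */
		IBD_PAD_NSNA(ip6h, len, IBD_SEND);
	}
}
#endif	/* IBD_EXAMPLE_SKETCHES */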

/*
 * IETF defined IPoIB encapsulation header, with 2 bytes of ethertype
 * followed by 2 reserved bytes. This is at the start of the
 * datagram sent to and received over the wire by the driver.
 */
typedef struct ipoib_header {
	ushort_t	ipoib_type;
	ushort_t	ipoib_mbz;
} ipoib_hdr_t;

#define	IPOIB_HDRSIZE	sizeof (struct ipoib_header)

/*
 * IETF defined IPoIB link address; IBA QPN, followed by GID,
 * which has a prefix and suffix, as reported via ARP.
 */
typedef struct ipoib_mac {
	uint32_t	ipoib_qpn;
	uint32_t	ipoib_gidpref[2];
	uint32_t	ipoib_gidsuff[2];
} ipoib_mac_t;

#define	IPOIB_ADDRL	sizeof (struct ipoib_mac)

/*
 * Pseudo header prepended to datagram in DLIOCRAW transmit path
 * and when GLD hands the datagram to the gldm_send entry point.
 */
typedef struct ipoib_ptxhdr {
	ipoib_mac_t	ipoib_dest;
	ipoib_hdr_t	ipoib_rhdr;
} ipoib_ptxhdr_t;

#define	IPOIBDLSAP(p, offset)	((ipoib_ptxhdr_t *)((caddr_t)(p)+offset))

/*
 * The pseudo-GRH structure that sits before the data in the
 * receive buffer, and is overlaid on top of the real GRH.
 * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
 * does not hold valid information. If it is indicated valid,
 * the driver must additionally provide the sender's qpn in
 * network byte order in ipoib_sqpn, and not touch the
 * remaining parts which were DMA'ed in by the IBA hardware.
 */
typedef struct ipoib_pgrh {
	uint32_t	ipoib_vertcflow;
	uint32_t	ipoib_sqpn;
	uint32_t	ipoib_sgid_pref[2];
	uint32_t	ipoib_sgid_suff[2];
	uint32_t	ipoib_dgid_pref[2];
	uint32_t	ipoib_dgid_suff[2];
} ipoib_pgrh_t;

/*
 * The GRH is also dma'ed into recv buffers, thus space needs
 * to be allocated for them.
 */
#define	IPOIB_GRH_SIZE	sizeof (ipoib_pgrh_t)
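
/*
 * Illustrative sketch only, guarded out of compilation: reading the
 * sender's QPN from the pseudo-GRH, per the validity rule described
 * above. The helper is hypothetical and ntohl() availability is
 * assumed from the including environment.
 */
#ifdef IBD_EXAMPLE_SKETCHES	/* never defined; examples only */
static boolean_t
ibd_example_pgrh_sqpn(const ipoib_pgrh_t *pgrh, uint32_t *sqpn)
{
	if (pgrh->ipoib_vertcflow == 0)
		return (B_FALSE);	/* pseudo-GRH holds no valid info */
	*sqpn = ntohl(pgrh->ipoib_sqpn);	/* stored in net byte order */
	return (B_TRUE);
}
#endif	/* IBD_EXAMPLE_SKETCHES */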

/* support the RC (Reliable Connected) mode */
#define	IBD_MAC_ADDR_RC		0x80000000
/* support the UC (Unreliable Connected) mode */
#define	IBD_MAC_ADDR_UC		0x40000000

#define	IBD_RC_SERVICE_ID 0x100000000000000ULL

/*
 * Legacy OFED used a wrong service ID (with one additional zero digit)
 * for many years. To interoperate with legacy OFED, we also support this
 * wrong service ID here.
 */
#define	IBD_RC_SERVICE_ID_OFED_INTEROP 0x1000000000000000ULL

#define	IBD_RC_MIN_CQ_SIZE	0x7f

/* Number of ibt_wc_t provided for each RC channel */
#define	IBD_RC_MAX_CQ_WC	0x3f

#if defined(_KERNEL) && !defined(_BOOT)

#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ib_pkt_hdrs.h>
#include <sys/list.h>
#include <sys/mac_provider.h>
#include <sys/mac_ib.h>
#include <sys/modhash.h>

/* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
typedef enum {
	IBD_RC_STATE_INIT = 0,

	/* Active side */
	IBD_RC_STATE_ACT_REP_RECV,	/* reply received */
	IBD_RC_STATE_ACT_ESTAB,		/* established, ready to send */
	IBD_RC_STATE_ACT_REJECT,	/* rejected */
	/* Someone else is closing this channel, please don't re-close it */
	IBD_RC_STATE_ACT_CLOSING,
	IBD_RC_STATE_ACT_CLOSED,
	IBD_RC_STATE_ACT_ERROR,

	/* Passive side */
	IBD_RC_STATE_PAS_REQ_RECV,	/* request received */
	IBD_RC_STATE_PAS_ESTAB,		/* established, ready to receive */
	IBD_RC_STATE_PAS_REJECT,	/* rejected */

	IBD_RC_STATE_PAS_CLOSED
} ibd_rc_chan_state_t;

/*
 * Structure to encapsulate various types of async requests.
 */
typedef struct ibd_acache_rq {
	struct list_node	rq_list;	/* list of pending work */
	int			rq_op;		/* what operation */
	ipoib_mac_t		rq_mac;
	ib_gid_t		rq_gid;
	void			*rq_ptr;
	void			*rq_ptr2;
} ibd_req_t;
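
/*
 * Illustrative sketch only, guarded out of compilation: queuing one of
 * the IBD_ASYNC_* operations for the async thread. ibd_state_t (with
 * its id_req_kmc request cache) and ibd_queue_work_slot() are declared
 * later in this header; the KM_NOSLEEP allocation pattern shown here is
 * an assumption about the caller.
 */
#ifdef IBD_EXAMPLE_SKETCHES	/* never defined; examples only */
static void
ibd_example_queue_trap(struct ibd_state_s *state)
{
	ibd_req_t *req;

	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req != NULL)
		ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
}
#endif	/* IBD_EXAMPLE_SKETCHES */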

typedef struct ibd_mcache {
	struct list_node	mc_list;	/* full/non list */
	uint8_t			mc_jstate;
	boolean_t		mc_fullreap;
	ibt_mcg_info_t		mc_info;
	ibd_req_t		mc_req;		/* to queue LEAVE req */
} ibd_mce_t;

typedef struct ibd_acache_s {
	struct list_node	ac_list;	/* free/active list */
	ibt_ud_dest_hdl_t	ac_dest;
	ipoib_mac_t		ac_mac;
	uint32_t		ac_ref;
	ibd_mce_t		*ac_mce;	/* for MCG AHs */

	/* For Reliable Connected mode */
	struct ibd_rc_chan_s	*ac_chan;
	/* protects tx_too_big_ongoing */
	kmutex_t		tx_too_big_mutex;
	/* deal with too-big packets */
	boolean_t		tx_too_big_ongoing;
} ibd_ace_t;

#define	IBD_MAX_SQSEG	59
#define	IBD_MAX_RQSEG	1

typedef enum {
	IBD_WQE_SEND,
	IBD_WQE_RECV
} ibd_wqe_type_t;

typedef enum {
	IBD_WQE_TXBUF = 1,
	IBD_WQE_LSOBUF = 2,
	IBD_WQE_MAPPED = 3,
	IBD_WQE_RC_COPYBUF = 4
} ibd_wqe_buftype_t;

#ifdef DEBUG
typedef struct ibd_rc_stat_s {
	kstat_named_t	rc_rcv_trans_byte;
	kstat_named_t	rc_rcv_trans_pkt;
	kstat_named_t	rc_rcv_copy_byte;
	kstat_named_t	rc_rcv_copy_pkt;
	kstat_named_t	rc_rcv_alloc_fail;

	kstat_named_t	rc_rcq_invoke;
	kstat_named_t	rc_rcq_err;	/* fail in rcq handler */
	kstat_named_t	rc_scq_invoke;

	kstat_named_t	rc_rwqe_short;	/* short rwqe */

	kstat_named_t	rc_xmt_bytes;
	/* pkt size <= state->id_rc_tx_copy_thresh */
	kstat_named_t	rc_xmt_small_pkt;
	kstat_named_t	rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	kstat_named_t	rc_xmt_map_fail_pkt;
	/* succeed in ibt_map_mem_iov() */
	kstat_named_t	rc_xmt_map_succ_pkt;

	kstat_named_t	rc_ace_not_found;	/* ace not found */
	/* no swqe even after recycle */
	kstat_named_t	rc_scq_no_swqe;
	/* no tx large buf even after recycle */
	kstat_named_t	rc_scq_no_largebuf;

	/* short swqe in ibd_send() */
	kstat_named_t	rc_swqe_short;
	/* call mac_tx_update() when there are enough swqes */
	kstat_named_t	rc_swqe_mac_update;
	/* short large buf in ibd_send() */
	kstat_named_t	rc_xmt_buf_short;
	/* call mac_tx_update() when there are enough Tx large buffers */
	kstat_named_t	rc_xmt_buf_mac_update;

	kstat_named_t	rc_conn_succ;	/* # of successful connects */
	kstat_named_t	rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	kstat_named_t	rc_null_conn;
	/* not in active established state */
	kstat_named_t	rc_no_estab_conn;

	kstat_named_t	rc_act_close;	/* call ibd_rc_act_close() */
	kstat_named_t	rc_pas_close;	/* call ibd_rc_pas_close() */
	kstat_named_t	rc_delay_ace_recycle;
	kstat_named_t	rc_act_close_simultaneous;

	kstat_named_t	rc_reset_cnt;	/* # of RC channel resets */
} ibd_rc_stat_t;
#endif

typedef struct ibd_rc_chan_list_s {
	/* This mutex protects chan_list and ibd_rc_chan_t.next */
	kmutex_t		chan_list_mutex;
	struct ibd_rc_chan_s	*chan_list;
} ibd_rc_chan_list_t;

typedef struct ibd_rc_tx_largebuf_s {
	struct ibd_rc_tx_largebuf_s	*lb_next;
	uint8_t				*lb_buf;
} ibd_rc_tx_largebuf_t;

/*
 * Pre-registered copybuf used for send and receive
 */
typedef struct ibd_copybuf_s {
	ibt_wr_ds_t	ic_sgl;
	uint8_t		*ic_bufaddr;
} ibd_copybuf_t;

typedef struct ibd_wqe_s {
	struct ibd_wqe_s	*w_next;
	ibd_copybuf_t		w_copybuf;
	mblk_t			*im_mblk;
} ibd_wqe_t;

/*
 * Send WQE
 */
typedef struct ibd_swqe_s {
	ibd_wqe_t		w_ibd_swqe;
	ibd_wqe_buftype_t	w_buftype;
	ibt_send_wr_t		w_swr;
	ibd_ace_t		*w_ahandle;
	ibt_mi_hdl_t		w_mi_hdl;
	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
	ibd_rc_tx_largebuf_t	*w_rc_tx_largebuf;
} ibd_swqe_t;

#define	swqe_next		w_ibd_swqe.w_next
#define	swqe_copybuf		w_ibd_swqe.w_copybuf
#define	swqe_im_mblk		w_ibd_swqe.im_mblk
#define	SWQE_TO_WQE(swqe)	(ibd_wqe_t *)&((swqe)->w_ibd_swqe)
#define	WQE_TO_SWQE(wqe)	(ibd_swqe_t *)wqe

/*
 * Receive WQE
 */
typedef struct ibd_rwqe_s {
	ibd_wqe_t		w_ibd_rwqe;
	struct ibd_state_s	*w_state;
	ibt_recv_wr_t		w_rwr;
	frtn_t			w_freemsg_cb;
	boolean_t		w_freeing_wqe;
	struct ibd_rc_chan_s	*w_chan;
} ibd_rwqe_t;

#define	rwqe_next		w_ibd_rwqe.w_next
#define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
#define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
#define	RWQE_TO_WQE(rwqe)	(ibd_wqe_t *)&((rwqe)->w_ibd_rwqe)
#define	WQE_TO_RWQE(wqe)	(ibd_rwqe_t *)wqe

typedef struct ibd_list_s {
	kmutex_t		dl_mutex;
	ibd_wqe_t		*dl_head;
	union {
		boolean_t	pending_sends;
		uint32_t	bufs_outstanding;
	} ustat;
	uint32_t		dl_cnt;
} ibd_list_t;

#define	dl_pending_sends	ustat.pending_sends
#define	dl_bufs_outstanding	ustat.bufs_outstanding

/*
 * LSO buffers
 *
 * Under normal circumstances we should never need to use any buffer
 * that's larger than MTU. Unfortunately, IB HCAs have limits on the
 * length of the SGL that are much smaller than those of regular
 * ethernet NICs. Since the network layer doesn't care to limit the
 * number of mblk fragments in any send mp chain, we end up having to
 * use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
 * buffers occasionally.
 */
typedef struct ibd_lsobuf_s {
	struct ibd_lsobuf_s	*lb_next;
	uint8_t			*lb_buf;
	int			lb_isfree;
} ibd_lsobuf_t;

typedef struct ibd_lsobkt_s {
	uint8_t		*bkt_mem;
	ibd_lsobuf_t	*bkt_bufl;
	ibd_lsobuf_t	*bkt_free_head;
	ibt_mr_hdl_t	bkt_mr_hdl;
	ibt_mr_desc_t	bkt_mr_desc;
	uint_t		bkt_nelem;
	uint_t		bkt_nfree;
} ibd_lsobkt_t;

#define	IBD_PORT_DRIVER		0x1
#define	IBD_PARTITION_OBJ	0x2

/*
 * Posting to a single software rx post queue is contentious,
 * so break it out into an array of multiple queues.
 *
 * Try to ensure rx_queue structs fall in different cache lines using a filler.
 * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
 */
#define	RX_QUEUE_CACHE_LINE \
	(64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t)))
typedef struct ibd_rx_queue_s {
	kmutex_t	rx_post_lock;
	ibd_wqe_t	*rx_head;
	uint_t		rx_cnt;
	uint8_t		rx_pad[RX_QUEUE_CACHE_LINE];
} ibd_rx_queue_t;
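
/*
 * Illustrative sketch only: the note above says RX_QUEUE_CACHE_LINE must
 * be updated whenever ibd_rx_queue_t changes. One hedged way to catch a
 * mismatch at build time would be a compile-time assertion such as
 * CTASSERT() from <sys/debug.h>; this is a suggestion, not existing code.
 */
#ifdef IBD_EXAMPLE_SKETCHES	/* never defined; examples only */
CTASSERT(sizeof (ibd_rx_queue_t) == 64);	/* one 64-byte cache line */
#endif	/* IBD_EXAMPLE_SKETCHES */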

/*
 * This structure maintains information per port per HCA
 * (per network interface).
 */
typedef struct ibd_state_s {
	uint_t			id_type;
	dev_info_t		*id_dip;
	ibt_clnt_hdl_t		id_ibt_hdl;
	ibt_hca_hdl_t		id_hca_hdl;
	ibt_pd_hdl_t		id_pd_hdl;
	kmem_cache_t		*id_req_kmc;

	ibd_list_t		id_tx_rel_list;

	uint32_t		id_running;

	uint32_t		id_max_sqseg;
	uint32_t		id_max_sqseg_hiwm;
	ibd_list_t		id_tx_list;
	ddi_softintr_t		id_tx;
	uint32_t		id_tx_sends;

	kmutex_t		id_txpost_lock;
	ibd_swqe_t		*id_tx_head;
	ibd_swqe_t		*id_tx_tail;
	int			id_tx_busy;

	uint_t			id_tx_buf_sz;
	uint8_t			*id_tx_bufs;
	ibd_swqe_t		*id_tx_wqes;
	ibt_mr_hdl_t		id_tx_mr_hdl;
	ibt_mr_desc_t		id_tx_mr_desc;

	kmutex_t		id_lso_lock;
	ibd_lsobkt_t		*id_lso;

	kmutex_t		id_scq_poll_lock;
	int			id_scq_poll_busy;

	ibt_cq_hdl_t		id_scq_hdl;
	ibt_wc_t		*id_txwcs;
	uint32_t		id_txwcs_size;

	int			id_rx_nqueues;
	ibd_rx_queue_t		*id_rx_queues;
	int			id_rx_post_queue_index;
	uint32_t		id_rx_post_active;

	ibd_rwqe_t		*id_rx_wqes;
	uint8_t			*id_rx_bufs;
	ibt_mr_hdl_t		id_rx_mr_hdl;
	ibt_mr_desc_t		id_rx_mr_desc;
	uint_t			id_rx_buf_sz;
	/*
	 * id_ud_num_rwqe
	 * Number of "receive WQE" elements that will be allocated and used
	 * by ibd. This parameter is limited by the maximum channel size of
	 * the HCA. Each buffer in the receive wqe will be of MTU size.
	 */
	uint32_t		id_ud_num_rwqe;
	ibd_list_t		id_rx_list;
	ddi_softintr_t		id_rx;
	uint32_t		id_rx_bufs_outstanding_limit;
	uint32_t		id_rx_allocb;
	uint32_t		id_rx_allocb_failed;
	ibd_list_t		id_rx_free_list;

	kmutex_t		id_rcq_poll_lock;
	int			id_rcq_poll_busy;
	uint32_t		id_rxwcs_size;
	ibt_wc_t		*id_rxwcs;
	ibt_cq_hdl_t		id_rcq_hdl;

	ibt_channel_hdl_t	id_chnl_hdl;
	ib_pkey_t		id_pkey;
	uint16_t		id_pkix;
	uint8_t			id_port;
	ibt_mcg_info_t		*id_mcinfo;

	mac_handle_t		id_mh;
	mac_resource_handle_t	id_rh;
	ib_gid_t		id_sgid;
	ib_qpn_t		id_qpnum;
	ipoib_mac_t		id_macaddr;
	ib_gid_t		id_mgid;
	ipoib_mac_t		id_bcaddr;

	int			id_mtu;
	uchar_t			id_scope;

	kmutex_t		id_acache_req_lock;
	kcondvar_t		id_acache_req_cv;
	struct list		id_req_list;
	kt_did_t		id_async_thrid;

	kmutex_t		id_ac_mutex;
	ibd_ace_t		*id_ac_hot_ace;
	struct list		id_ah_active;
	struct list		id_ah_free;
	ipoib_mac_t		id_ah_addr;
	ibd_req_t		id_ah_req;
	char			id_ah_op;
	uint64_t		id_ah_error;
	ibd_ace_t		*id_ac_list;
	mod_hash_t		*id_ah_active_hash;

	kmutex_t		id_mc_mutex;
	struct list		id_mc_full;
	struct list		id_mc_non;

	kmutex_t		id_trap_lock;
	kcondvar_t		id_trap_cv;
	boolean_t		id_trap_stop;
	uint32_t		id_trap_inprog;

	char			id_prom_op;

	kmutex_t		id_sched_lock;
	int			id_sched_needed;
	int			id_sched_cnt;
	int			id_sched_lso_cnt;

	kmutex_t		id_link_mutex;
	link_state_t		id_link_state;
	uint64_t		id_link_speed;

	uint64_t		id_num_intrs;
	uint64_t		id_tx_short;
	/*
	 * id_ud_num_swqe
	 * Number of "send WQE" elements that will be allocated and used by
	 * ibd. When tuning this parameter, the size of the pre-allocated,
	 * pre-mapped copy buffer in each of these send wqes must be taken
	 * into account. This copy buffer size is determined by the value of
	 * IBD_TX_BUF_SZ (this is currently set to the same value as
	 * ibd_tx_copy_thresh, but may be changed independently if needed).
	 */
	uint32_t		id_ud_num_swqe;

	uint64_t		id_xmt_bytes;
	uint64_t		id_rcv_bytes;
	uint64_t		id_multi_xmt;
	uint64_t		id_brd_xmt;
	uint64_t		id_multi_rcv;
	uint64_t		id_brd_rcv;
	uint64_t		id_xmt_pkt;
	uint64_t		id_rcv_pkt;

	uint32_t		id_hwcksum_capab;
	boolean_t		id_lso_policy;
	boolean_t		id_lso_capable;
	uint_t			id_lso_maxlen;
	int			id_hca_res_lkey_capab;
	ibt_lkey_t		id_res_lkey;

	boolean_t		id_bgroup_created;
	kmutex_t		id_macst_lock;
	kcondvar_t		id_macst_cv;
	uint32_t		id_mac_state;

	/* For Reliable Connected Mode */
	boolean_t		id_enable_rc;
	boolean_t		rc_enable_srq;

	int			rc_mtu;
	uint32_t		rc_tx_max_sqseg;
	/*
	 * In IPoIB over Reliable Connected mode, the mac address has an
	 * "IBD_MAC_ADDR_RC" prefix added to it. But for the loopback
	 * filter in ibd_process_rx(), the input mac address should not
	 * include the "IBD_MAC_ADDR_RC" prefix.
	 *
	 * So, we introduce rc_macaddr_loopback for the loopback filter
	 * in IPoIB over Reliable Connected mode.
	 *
	 * rc_macaddr_loopback = id_macaddr without the "IBD_MAC_ADDR_RC"
	 * prefix.
	 */
	ipoib_mac_t		rc_macaddr_loopback;

	ibt_srv_hdl_t		rc_listen_hdl;
	ibt_sbind_hdl_t		rc_listen_bind;
	ibt_srv_hdl_t		rc_listen_hdl_OFED_interop;
	ibt_sbind_hdl_t		rc_listen_bind_OFED_interop;

	ibd_rc_chan_list_t	rc_pass_chan_list;
	/* obsolete active channel list */
	ibd_rc_chan_list_t	rc_obs_act_chan_list;

	kmutex_t		rc_ace_recycle_lock;
	ibd_ace_t		*rc_ace_recycle;

	/* Send */
	/*
	 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
	 * and ibd_rc_tx_largebuf_t->lb_next
	 */
	kmutex_t		rc_tx_large_bufs_lock;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_free_head;
	uint_t			rc_tx_largebuf_nfree;
	/* the chunk of all Tx large buffers */
	uint8_t			*rc_tx_mr_bufs;
	ibt_mr_hdl_t		rc_tx_mr_hdl;
	ibt_mr_desc_t		rc_tx_mr_desc;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_desc_base;	/* base addr */

	boolean_t		rc_enable_iov_map;
	uint_t			rc_max_sqseg_hiwm;

	/* For SRQ */
	uint32_t		rc_srq_size;
	ibt_srq_hdl_t		rc_srq_hdl;
	ibd_list_t		rc_srq_rwqe_list;
	ibd_list_t		rc_srq_free_list;
	ibd_rwqe_t		*rc_srq_rwqes;
	uint8_t			*rc_srq_rx_bufs;
	ibt_mr_hdl_t		rc_srq_rx_mr_hdl;
	ibt_mr_desc_t		rc_srq_rx_mr_desc;

	/* For chained receive */
	kmutex_t		rc_rx_lock;
	mblk_t			*rc_rx_mp;
	mblk_t			*rc_rx_mp_tail;
	uint32_t		rc_rx_mp_len;

	/* Counters for RC mode */
	/* RX */
	/*
	 * # of received packets that were transferred directly to GLD
	 * without copying
	 */
	uint64_t		rc_rcv_trans_byte;
	uint64_t		rc_rcv_trans_pkt;
	/*
	 * # of received packets for which we allocate new buffers,
	 * copy their contents into those new buffers, and then transfer
	 * them to GLD
	 */
	uint64_t		rc_rcv_copy_byte;
	uint64_t		rc_rcv_copy_pkt;
	uint64_t		rc_rcv_alloc_fail;

#ifdef DEBUG
	uint64_t		rc_rwqe_short;	/* short rwqe */
#endif

	/* # of Receive CQ handler invocations */
	uint64_t		rc_rcq_invoke;
	/* wc->wc_status != IBT_WC_SUCCESS */
	uint64_t		rc_rcq_err;

	/* Tx */
	uint64_t		rc_xmt_bytes;

	/* pkt size <= ibd_rc_tx_copy_thresh */
	uint64_t		rc_xmt_small_pkt;
	uint64_t		rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_fail_pkt;
	/* succeed in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_succ_pkt;

	uint64_t		rc_ace_not_found;

	uint64_t		rc_xmt_drop_too_long_pkt;
	uint64_t		rc_xmt_icmp_too_long_pkt;
	uint64_t		rc_xmt_reenter_too_long_pkt;

	/* short swqe in ibd_send() */
	uint64_t		rc_swqe_short;
	/* call mac_tx_update() when there are enough swqes */
	uint64_t		rc_swqe_mac_update;
	/* short Tx large copy buf in ibd_send() */
	uint64_t		rc_xmt_buf_short;
	/* call mac_tx_update() when there are enough Tx copy bufs */
	uint64_t		rc_xmt_buf_mac_update;

	/* no swqe even after calling the swqe recycle function */
	uint64_t		rc_scq_no_swqe;
	/* no large Tx buf even after calling the swqe recycle function */
	uint64_t		rc_scq_no_largebuf;
	/* # of Send CQ handler invocations */
	uint64_t		rc_scq_invoke;

	/* Connection setup and close */
	uint64_t		rc_conn_succ;	/* # of successful connects */
	uint64_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	uint64_t		rc_null_conn;
	/* not in active established state */
	uint64_t		rc_no_estab_conn;

	uint64_t		rc_act_close;	/* call ibd_rc_act_close() */
	uint64_t		rc_pas_close;	/* call ibd_rc_pas_close() */
	uint64_t		rc_delay_ace_recycle;
	uint64_t		rc_act_close_simultaneous;

	/* # of RC channel resets */
	uint64_t		rc_reset_cnt;

#ifdef DEBUG
	kstat_t			*rc_ksp;
#endif
	ib_guid_t		id_hca_guid;
	ib_guid_t		id_port_guid;
	datalink_id_t		id_dlinkid;
	datalink_id_t		id_plinkid;
	int			id_port_inst;
	struct ibd_state_s	*id_next;
	boolean_t		id_force_create;
	boolean_t		id_bgroup_present;
	uint_t			id_hca_max_chan_sz;

	/*
	 * UD Mode Tunables
	 *
	 * id_ud_tx_copy_thresh
	 * This sets the threshold at which ibd will attempt to do a bcopy
	 * of the outgoing data into a pre-mapped buffer. The IPoIB driver's
	 * send behavior is restricted by various parameters, so this value
	 * should only be changed after careful consideration. For instance,
	 * IB HCAs currently impose a relatively small limit (when compared
	 * to ethernet NICs) on the length of the SGL for transmit. On the
	 * other hand, the ip stack could send down mp chains that are
	 * quite long when LSO is enabled.
	 *
	 * id_num_lso_bufs
	 * Number of "larger-than-MTU" copy buffers to use for cases when the
	 * outgoing mblk chain is too fragmented to be used with
	 * ibt_map_mem_iov() and too large to be used with regular MTU-sized
	 * copy buffers. It is not recommended to tune this variable without
	 * understanding the application environment and/or memory resources.
	 * The size of each of these lso buffers is determined by the value of
	 * IBD_LSO_BUFSZ.
	 *
	 * id_num_ah
	 * Number of AH cache entries to allocate
	 *
	 * id_hash_size
	 * Hash table size for the active AH list
	 */
	uint_t			id_ud_tx_copy_thresh;
	uint_t			id_num_lso_bufs;
	uint_t			id_num_ah;
	uint_t			id_hash_size;

	boolean_t		id_create_broadcast_group;

	boolean_t		id_allow_coalesce_comp_tuning;
	uint_t			id_ud_rx_comp_count;
	uint_t			id_ud_rx_comp_usec;
	uint_t			id_ud_tx_comp_count;
	uint_t			id_ud_tx_comp_usec;

	/* RC Mode Tunables */

	uint_t			id_rc_rx_comp_count;
	uint_t			id_rc_rx_comp_usec;
	uint_t			id_rc_tx_comp_count;
	uint_t			id_rc_tx_comp_usec;
	/*
	 * id_rc_tx_copy_thresh
	 * This sets the threshold at which ibd will attempt to do a bcopy
	 * of the outgoing data into a pre-mapped buffer.
	 *
	 * id_rc_rx_copy_thresh
	 * If (the size of the incoming buffer <= id_rc_rx_copy_thresh), ibd
	 * will attempt to allocate a buffer and do a bcopy of the incoming
	 * data into the allocated buffer.
	 *
	 * id_rc_rx_rwqe_thresh
	 * If (the number of available rwqes < ibd_rc_rx_rwqe_thresh), ibd
	 * will attempt to allocate a buffer and do a bcopy of the incoming
	 * data into the allocated buffer.
	 *
	 * id_rc_num_swqe
	 * 1) Send CQ size = ibd_rc_num_swqe
	 * 2) The send queue size = ibd_rc_num_swqe - 1
	 * 3) Number of pre-allocated Tx buffers for ibt_post_send() =
	 *    ibd_rc_num_swqe - 1.
	 *
	 * id_rc_num_rwqe
	 * 1) For non-SRQ, we pre-post ibd_rc_num_rwqe number of WRs
	 *    via ibt_post_receive() for the receive queue of each RC channel.
	 * 2) For SRQ and non-SRQ, receive CQ size = ibd_rc_num_rwqe
	 *
	 * For SRQ
	 * If using SRQ, we allocate ibd_rc_num_srq number of buffers (the
	 * size of each buffer is equal to the RC mtu) and post them via
	 * ibt_post_srq().
	 *
	 * id_rc_num_srq
	 * ibd_rc_num_srq should not be larger than ibd_rc_num_rwqe,
	 * otherwise it will cause a bug with the following warnings:
	 * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
	 * NOTICE: hermon0: Device Error: EQE local work queue catastrophic
	 * error
	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
	 * catastrophic channel error
	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
	 * completion queue error
	 */
	uint_t			id_rc_tx_copy_thresh;
	uint_t			id_rc_rx_copy_thresh;
	uint_t			id_rc_rx_rwqe_thresh;
	uint_t			id_rc_num_swqe;
	uint_t			id_rc_num_rwqe;
	uint_t			id_rc_num_srq;
} ibd_state_t;
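
/*
 * Illustrative sketch only, guarded out of compilation: clamping a
 * configured value to the IBD_MIN_ / IBD_MAX_ tunable limits defined
 * earlier in this header. The helper is hypothetical; the driver's
 * real tunable handling lives in ibd.c and ibd_cm.c.
 */
#ifdef IBD_EXAMPLE_SKETCHES	/* never defined; examples only */
static uint_t
ibd_example_clamp_num_lso_bufs(uint_t requested)
{
	if (requested < IBD_MIN_NUM_LSO_BUFS)
		return (IBD_MIN_NUM_LSO_BUFS);
	if (requested > IBD_MAX_NUM_LSO_BUFS)
		return (IBD_MAX_NUM_LSO_BUFS);
	return (requested);
}
#endif	/* IBD_EXAMPLE_SKETCHES */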

/*
 * Structures to track global IBTF data, data that is shared
 * among the IBD device instances. This includes the one ibt_hdl
 * and the list of service registrations.
 */
typedef struct ibd_service_s {
	struct ibd_service_s	*is_link;
	ibt_srv_hdl_t		is_srv_hdl;
	ib_svc_id_t		is_sid;
	uint_t			is_ref_cnt;
} ibd_service_t;

typedef struct ibd_global_state_s {
	kmutex_t	ig_mutex;
	ibt_clnt_hdl_t	ig_ibt_hdl;
	uint_t		ig_ibt_hdl_ref_cnt;
	ibd_service_t	*ig_service_list;
} ibd_global_state_t;

typedef struct ibd_rc_msg_hello_s {
	uint32_t	reserved_qpn;
	uint32_t	rx_mtu;
} ibd_rc_msg_hello_t;

typedef struct ibd_rc_chan_s {
	struct ibd_rc_chan_s	*next;
	/* channel hdl that we'll be using for Reliable Connected Mode */
	ibt_channel_hdl_t	chan_hdl;
	struct ibd_state_s	*state;
	ibd_ace_t		*ace;
	ibd_rc_chan_state_t	chan_state;

	/* used to detect duplicate connections */
	ib_gid_t		requester_gid;
	ib_pkey_t		requester_pkey;

	ibd_list_t		tx_wqe_list;	/* free wqe list */
	ibd_list_t		tx_rel_list;	/* for swqe recycle */

	ibd_swqe_t		*tx_wqes;

	/* start address of Tx Buffers */
	uint8_t			*tx_mr_bufs;
	ibt_mr_hdl_t		tx_mr_hdl;
	ibt_mr_desc_t		tx_mr_desc;

	ibt_cq_hdl_t		scq_hdl;	/* Tx completion queue */
	ibt_wc_t		tx_wc[IBD_RC_MAX_CQ_WC];
	ddi_softintr_t		scq_softintr;

	uint32_t		tx_trans_error_cnt;

	/* For chained send */
	kmutex_t		tx_post_lock;
	ibd_swqe_t		*tx_head;
	ibd_swqe_t		*tx_tail;
	int			tx_busy;

	/* For tx buffer recycle */
	kmutex_t		tx_poll_lock;
	int			tx_poll_busy;

	/* Rx */
	ibd_list_t		rx_wqe_list;	/* used by ibt_post_recv */
	ibd_list_t		rx_free_list;	/* free rwqe list */

	ibt_cq_hdl_t		rcq_hdl;	/* Rx completion queue */
	ibt_wc_t		rx_wc[IBD_RC_MAX_CQ_WC];

	ibd_rwqe_t		*rx_rwqes;	/* the chunk of all rwqes */
	uint8_t			*rx_bufs;	/* the chunk of all Rx bufs */
	ibt_mr_hdl_t		rx_mr_hdl;	/* ibt_mr_hdl_t for rx_bufs */
	ibt_mr_desc_t		rx_mr_desc;	/* ibt_mr_desc_t for rx_bufs */

	/* For chained receive */
	kmutex_t		rx_lock;
	mblk_t			*rx_mp;
	mblk_t			*rx_mp_tail;
	uint32_t		rx_mp_len;

	uint32_t		rcq_size;
	uint32_t		scq_size;
	/*
	 * We need two channels for each connection:
	 * one channel for Tx and another channel for Rx.
	 * If "is_tx_chan == B_TRUE", this is a Tx channel.
	 */
	boolean_t		is_tx_chan;
} ibd_rc_chan_t;

/*
 * The following functions are defined in "ibd.c".
 * They are also used by "ibd_cm.c".
 */
void ibd_print_warn(ibd_state_t *, char *, ...);
void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);

/*
 * The following functions are defined in "ibd_cm.c".
 * They are also used in "ibd.c".
 */
void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *);

/* Connection Setup/Close Functions */
ibt_status_t ibd_rc_listen(ibd_state_t *);
void ibd_rc_stop_listen(ibd_state_t *);
ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *,
    uint64_t);
void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *);
void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *);
void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *);
void ibd_rc_close_all_chan(ibd_state_t *);

/* Receive Functions */
int ibd_rc_init_srq_list(ibd_state_t *);
void ibd_rc_fini_srq_list(ibd_state_t *);
int ibd_rc_repost_srq_free_list(ibd_state_t *);

/* Send Functions */
int ibd_rc_init_tx_largebuf_list(ibd_state_t *);
void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *);
void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *);
void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t);
void ibd_rc_tx_cleanup(ibd_swqe_t *);

/* Others */
void ibd_rc_get_conf(ibd_state_t *);
int ibd_rc_init_stats(ibd_state_t *);

#endif	/* _KERNEL && !_BOOT */

#ifdef __cplusplus
}
#endif

#endif	/* _SYS_IB_CLIENTS_IBD_H */