/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#ifndef _SYS_IB_CLIENTS_IBD_H
#define	_SYS_IB_CLIENTS_IBD_H

#ifdef __cplusplus
extern "C" {
#endif

/* The following macros are used in both ibd.c and ibd_cm.c */

/*
 * Completion queue polling control
 */
#define	IBD_CQ_POLLING			0x1
#define	IBD_REDO_CQ_POLLING		0x2

/*
 * Maximum length for returning chained mps back to crossbow.
 * Also used as the maximum number of rx wc's polled at a time.
 */
#define	IBD_MAX_RX_MP_LEN		16

/*
 * When doing multiple-send-wr, this value determines how many to do at
 * a time (in a single ibt_post_send).
 */
#define	IBD_MAX_TX_POST_MULTIPLE	4

/*
 * Flag bits for resources to reap
 */
#define	IBD_RSRC_SWQE			0x1
#define	IBD_RSRC_LSOBUF			0x2
#define	IBD_RSRC_RC_SWQE		0x4
#define	IBD_RSRC_RC_TX_LARGEBUF		0x8

/*
 * Async operation types
 */
#define	IBD_ASYNC_GETAH			1
#define	IBD_ASYNC_JOIN			2
#define	IBD_ASYNC_LEAVE			3
#define	IBD_ASYNC_PROMON		4
#define	IBD_ASYNC_PROMOFF		5
#define	IBD_ASYNC_REAP			6
#define	IBD_ASYNC_TRAP			7
#define	IBD_ASYNC_SCHED			8
#define	IBD_ASYNC_LINK			9
#define	IBD_ASYNC_EXIT			10
#define	IBD_ASYNC_RC_TOO_BIG		11
#define	IBD_ASYNC_RC_CLOSE_ACT_CHAN	12
#define	IBD_ASYNC_RC_RECYCLE_ACE	13
#define	IBD_ASYNC_RC_CLOSE_PAS_CHAN	14

/*
 * State of IBD driver initialization during attach/m_start
 */
#define	IBD_DRV_STATE_INITIALIZED	0x000001
#define	IBD_DRV_RXINTR_ADDED		0x000002
#define	IBD_DRV_TXINTR_ADDED		0x000004
#define	IBD_DRV_IBTL_ATTACH_DONE	0x000008
#define	IBD_DRV_HCA_OPENED		0x000010
#define	IBD_DRV_PD_ALLOCD		0x000020
#define	IBD_DRV_MAC_REGISTERED		0x000040
#define	IBD_DRV_PORT_DETAILS_OBTAINED	0x000080
#define	IBD_DRV_BCAST_GROUP_FOUND	0x000100
#define	IBD_DRV_ACACHE_INITIALIZED	0x000200
#define	IBD_DRV_CQS_ALLOCD		0x000400
#define	IBD_DRV_UD_CHANNEL_SETUP	0x000800
#define	IBD_DRV_TXLIST_ALLOCD		0x001000
#define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x002000
#define	IBD_DRV_RXLIST_ALLOCD		0x004000
#define	IBD_DRV_BCAST_GROUP_JOINED	0x008000
#define	IBD_DRV_ASYNC_THR_CREATED	0x010000
#define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x020000
#define	IBD_DRV_SM_NOTICES_REGISTERED	0x040000
#define	IBD_DRV_STARTED			0x080000
#define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
#define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
#define	IBD_DRV_RC_LISTEN		0x400000
#ifdef DEBUG
#define	IBD_DRV_RC_PRIVATE_STATE	0x800000
#endif
#define	IBD_DRV_IN_DELETION		0x1000000
#define	IBD_DRV_IN_LATE_HCA_INIT	0x2000000
#define	IBD_DRV_REQ_LIST_INITED		0x4000000
#define	IBD_DRV_RC_TIMEOUT		0x8000000
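
/*
 * Illustrative sketch (not part of the original header): the IBD_DRV_*
 * bits above record how far attach/m_start has progressed in the
 * id_mac_state word of ibd_state_t, so that teardown paths can undo
 * only what was actually set up. A typical check is assumed to look
 * roughly like:
 *
 *	if (state->id_mac_state & IBD_DRV_STARTED) {
 *		quiesce traffic before freeing resources, then
 *		state->id_mac_state &= ~IBD_DRV_STARTED;
 *	}
 */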

/*
 * Miscellaneous constants
 */
#define	IBD_SEND			0
#define	IBD_RECV			1

/* Tunables defaults and limits */
#define	IBD_LINK_MODE_UD		0
#define	IBD_LINK_MODE_RC		1

#define	IBD_DEF_LINK_MODE		IBD_LINK_MODE_RC
#define	IBD_DEF_LSO_POLICY		B_TRUE
#define	IBD_DEF_NUM_LSO_BUFS		1024
#define	IBD_DEF_CREATE_BCAST_GROUP	B_TRUE
#define	IBD_DEF_COALESCE_COMPLETIONS	B_TRUE
#define	IBD_DEF_UD_RX_COMP_COUNT	4
#define	IBD_DEF_UD_RX_COMP_USEC		10
#define	IBD_DEF_UD_TX_COMP_COUNT	16
#define	IBD_DEF_UD_TX_COMP_USEC		300
#define	IBD_DEF_RC_RX_COMP_COUNT	4
#define	IBD_DEF_RC_RX_COMP_USEC		10
#define	IBD_DEF_RC_TX_COMP_COUNT	10
#define	IBD_DEF_RC_TX_COMP_USEC		300
#define	IBD_DEF_UD_TX_COPY_THRESH	4096
#define	IBD_DEF_RC_RX_COPY_THRESH	4096
#define	IBD_DEF_RC_TX_COPY_THRESH	4096
#define	IBD_DEF_UD_NUM_RWQE		4000
#define	IBD_DEF_UD_NUM_SWQE		4000
#define	IBD_DEF_RC_ENABLE_SRQ		B_TRUE
#if defined(__i386)
#define	IBD_DEF_RC_NUM_RWQE		511
#define	IBD_DEF_RC_NUM_SWQE		255
#else
#define	IBD_DEF_RC_NUM_RWQE		2047
#define	IBD_DEF_RC_NUM_SWQE		511
#endif
#define	IBD_DEF_NUM_AH			256
#define	IBD_DEF_HASH_SIZE		32
#define	IBD_DEF_RC_NUM_SRQ		(IBD_DEF_RC_NUM_RWQE - 1)
#define	IBD_DEF_RC_RX_RWQE_THRESH	(IBD_DEF_RC_NUM_RWQE >> 2)

/* Tunable limits */
#define	IBD_MIN_NUM_LSO_BUFS		512
#define	IBD_MAX_NUM_LSO_BUFS		4096
#define	IBD_MIN_UD_TX_COPY_THRESH	2048
#define	IBD_MAX_UD_TX_COPY_THRESH	65536
#define	IBD_MIN_UD_NUM_SWQE		512
#define	IBD_MAX_UD_NUM_SWQE		8000
#define	IBD_MIN_UD_NUM_RWQE		512
#define	IBD_MAX_UD_NUM_RWQE		8000
#define	IBD_MIN_NUM_AH			32
#define	IBD_MAX_NUM_AH			8192
#define	IBD_MIN_HASH_SIZE		32
#define	IBD_MAX_HASH_SIZE		1024

#if defined(__i386)
#define	IBD_MIN_RC_NUM_SWQE		255
#else
#define	IBD_MIN_RC_NUM_SWQE		511
#endif
#define	IBD_MAX_RC_NUM_SWQE		8000
#define	IBD_MIN_RC_NUM_RWQE		511
#define	IBD_MAX_RC_NUM_RWQE		8000
#define	IBD_MIN_RC_RX_COPY_THRESH	1500
#define	IBD_MAX_RC_RX_COPY_THRESH	65520
#define	IBD_MIN_RC_TX_COPY_THRESH	1500
#define	IBD_MAX_RC_TX_COPY_THRESH	65520
#define	IBD_MIN_RC_NUM_SRQ		(IBD_MIN_RC_NUM_RWQE - 1)
#define	IBD_MIN_RC_RX_RWQE_THRESH	(IBD_MIN_RC_NUM_RWQE >> 2)

/*
 * Thresholds
 *
 * When waiting for resources (swqes or lso buffers) to become available,
 * the first two thresholds below determine how long to wait before informing
 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
 * determines how low the available swqes should go before we start polling
 * the completion queue.
 */
#define	IBD_FREE_LSOS_THRESH		8
#define	IBD_FREE_SWQES_THRESH		20
#define	IBD_TX_POLL_THRESH		80
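
/*
 * Illustrative sketch (assumed usage, not necessarily the driver's exact
 * logic): the send path is expected to compare the free swqe count kept
 * in id_tx_list.dl_cnt against these thresholds, for example
 *
 *	if (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)
 *		start polling the send completion queue for finished swqes;
 *
 * and to hold off calling mac_tx_update() until at least
 * IBD_FREE_SWQES_THRESH swqes (or IBD_FREE_LSOS_THRESH lso buffers) are
 * back on the free list.
 */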

#ifdef DEBUG
void debug_print(int l, char *fmt, ...);
#define	DPRINT		debug_print
#else
#define	DPRINT		0 &&
#endif

/*
 * AH and MCE active list manipulation:
 *
 * Multicast disable requests and MCG delete traps are two cases
 * where the active AH entry for the mcg (if any unreferenced one exists)
 * will be moved to the free list (to force the next Tx to the mcg to
 * join the MCG in SendOnly mode). Port up handling will also move AHs
 * from active to free list.
 *
 * In the case when some transmits are still pending on an entry
 * for an mcg, but a multicast disable has already been issued on the
 * mcg, there are some options to consider to preserve the join state
 * to ensure the emitted packet is properly routed on the IBA fabric.
 * For the AH, we can
 * 1. take out of active list at multicast disable time.
 * 2. take out of active list only when last pending Tx completes.
 * For the MCE, we can
 * 3. take out of active list at multicast disable time.
 * 4. take out of active list only when last pending Tx completes.
 * 5. move from active list to stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (i.e., the mcg leave is done) when the pending Tx does complete.
 * Alternatively, a stale list (#5) that would be checked in the enable
 * code would need to be implemented. Option 2 is used, because otherwise,
 * a Tx attempt after the multicast disable would try to put an AH in the
 * active list, and associate the mce it finds in the active list to this
 * new AH, whereas the mce is already associated with the previous AH
 * (taken off the active list), and will be removed once the pending Tx's
 * complete (unless a reference count on mce's is implemented). One
 * implication of using 2,4 is that new Tx's posted before the pending
 * Tx's complete will grab new references on the AH, further delaying the
 * leave.
 *
 * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes thru
 * reincarnations).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it can not possibly receive a mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best. Thus, the driver just prints a warning
 * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */

/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that max reference
 * count on any handle is limited by number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, num);		\
}
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		CLEAR_REFCYCLE(ace), B_FALSE :		\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?	\
	/*						\
	 * Ref count known to be 0 from above.		\
	 */						\
	B_TRUE :					\
	B_FALSE						\
)
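
/*
 * Illustrative sketch (assumed usage, not a verbatim copy of ibd.c):
 * the Tx path takes a reference on the ACE for every send it posts,
 * and the send completion handler drops it without taking id_ac_mutex:
 *
 *	INC_REF(ace, 1);	before posting the send work request
 *	...
 *	if (DEC_REF_DO_CYCLE(ace)) {
 *		the ref count hit 0 and the "cycle" bit was set, so the
 *		async thread had asked for this ACE to be recycled; hand
 *		it back (e.g. via the IBD_ASYNC_REAP path).
 *	}
 *
 * SET_CYCLE_IF_REF() is the other half: the async thread uses it to mark
 * a still-referenced ACE so that the last completion performs the recycle.
 */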

/*
 * Address handle entries maintained by the driver are kept in the
 * free and active lists. Each entry starts out in the free list;
 * it migrates to the active list when primed using ibt_get_paths()
 * and ibt_modify_ud_dest() for transmission to a specific destination.
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (ie the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are atomically
 * done (increments done by Tx path, decrements by the Tx callback handler).
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
	state->id_ac_hot_ace = ce;				\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	if (state->id_ac_hot_ace == ce)				\
		state->id_ac_hot_ace = NULL;			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)
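
/*
 * Illustrative sketch (assumed, simplified from what ibd_acache_find()
 * is expected to do): a transmit looks up the ACE for a destination by
 * first checking the one-entry "hot" cache and then the active hash,
 * under id_ac_mutex, taking a reference before dropping the lock:
 *
 *	ibd_ace_t *ce;
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	if ((ce = state->id_ac_hot_ace) != NULL &&
 *	    bcmp(&ce->ac_mac, mac, IPOIB_ADDRL) != 0)
 *		ce = NULL;
 *	if (ce == NULL)
 *		(void) mod_hash_find(state->id_ah_active_hash,
 *		    (mod_hash_key_t)mac, (mod_hash_val_t *)&ce);
 *	if (ce != NULL)
 *		INC_REF(ce, 1);
 *	mutex_exit(&state->id_ac_mutex);
 */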

/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * front of optional src/tgt link layer address. Right now Solaris inserts
 * padding by default at the end. The routine that does this is nce_xmit()
 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
 * the packet comes down from the IP layer to the IBD driver, it is in the
 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * The option header is 2 bytes, followed by [22 bytes of ipoib_machdr].
 * As a result machdr is not 4 byte aligned and has 2 bytes of padding at
 * the end.
 *
 * The send routine at IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
 * aligned.
 *
 * At the receiving side again ibd_process_rx takes the above packet and
 * removes the two bytes of front padding and inserts it at the end. This
 * is because the IP layer does not understand padding at the front.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {				\
	uchar_t		*nd_lla_ptr;				\
	icmp6_t		*icmp6;					\
	nd_opt_hdr_t	*opt;					\
	int		i;					\
								\
	icmp6 = (icmp6_t *)&ip6h[1];				\
	len -= sizeof (nd_neighbor_advert_t);			\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||	\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&	\
	    (len != 0)) {					\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
		ASSERT(opt != NULL);				\
		nd_lla_ptr = (uchar_t *)&opt[1];		\
		if (type == IBD_SEND) {				\
			for (i = IPOIB_ADDRL; i > 0; i--)	\
				*(nd_lla_ptr + i + 1) =		\
				    *(nd_lla_ptr + i - 1);	\
		} else {					\
			for (i = 0; i < IPOIB_ADDRL; i++)	\
				*(nd_lla_ptr + i) =		\
				    *(nd_lla_ptr + i + 2);	\
		}						\
		*(nd_lla_ptr + i) = 0;				\
		*(nd_lla_ptr + i + 1) = 0;			\
	}							\
}


/*
 * IETF defined IPoIB encapsulation header, with 2 bytes of ethertype
 * followed by 2 reserved bytes. This is at the start of the
 * datagram sent to and received over the wire by the driver.
 */
typedef struct ipoib_header {
	ushort_t	ipoib_type;
	ushort_t	ipoib_mbz;
} ipoib_hdr_t;

#define	IPOIB_HDRSIZE	sizeof (struct ipoib_header)

/*
 * IETF defined IPoIB link address; IBA QPN, followed by GID,
 * which has a prefix and suffix, as reported via ARP.
 */
typedef struct ipoib_mac {
	uint32_t	ipoib_qpn;
	uint32_t	ipoib_gidpref[2];
	uint32_t	ipoib_gidsuff[2];
} ipoib_mac_t;

#define	IPOIB_ADDRL	sizeof (struct ipoib_mac)

/*
 * Pseudo header prepended to datagram in DLIOCRAW transmit path
 * and when GLD hands the datagram to the gldm_send entry point.
 */
typedef struct ipoib_ptxhdr {
	ipoib_mac_t	ipoib_dest;
	ipoib_hdr_t	ipoib_rhdr;
} ipoib_ptxhdr_t;

#define	IPOIBDLSAP(p, offset)	((ipoib_ptxhdr_t *)((caddr_t)(p)+offset))

/*
 * The pseudo-GRH structure that sits before the data in the
 * receive buffer, and is overlaid on top of the real GRH.
 * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
 * does not hold valid information. If it is indicated valid,
 * the driver must additionally provide the sender's qpn in
 * network byte order in ipoib_sqpn, and not touch the
 * remaining parts which were DMA'ed in by the IBA hardware.
 */
typedef struct ipoib_pgrh {
	uint32_t	ipoib_vertcflow;
	uint32_t	ipoib_sqpn;
	uint32_t	ipoib_sgid_pref[2];
	uint32_t	ipoib_sgid_suff[2];
	uint32_t	ipoib_dgid_pref[2];
	uint32_t	ipoib_dgid_suff[2];
} ipoib_pgrh_t;

/*
 * The GRH is also dma'ed into recv buffers, thus space needs
 * to be allocated for them.
 */
#define	IPOIB_GRH_SIZE	sizeof (ipoib_pgrh_t)
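
/*
 * Illustrative sketch (assumed usage): receive processing can tell
 * whether the pseudo-GRH prepended to a datagram is meaningful by
 * testing ipoib_vertcflow, as described above. With "buf" standing in
 * for the start of a receive buffer (hypothetical name):
 *
 *	ipoib_pgrh_t *pgrh = (ipoib_pgrh_t *)buf;
 *
 *	if (pgrh->ipoib_vertcflow != 0) {
 *		the sender's QPN (network byte order) is in
 *		pgrh->ipoib_sqpn and the source GID fields are valid;
 *		loopback filtering is expected to rely on these.
 *	}
 */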

/* support the RC (reliable connected) mode */
#define	IBD_MAC_ADDR_RC		0x80000000
/* support the UC (unreliable connected) mode */
#define	IBD_MAC_ADDR_UC		0x40000000

#define	IBD_RC_SERVICE_ID	0x100000000000000ULL

/*
 * Legacy OFED had used a wrong service ID (one additional zero digit) for
 * many years. To interoperate with legacy OFED, we support this wrong
 * service ID here.
 */
#define	IBD_RC_SERVICE_ID_OFED_INTEROP	0x1000000000000000ULL

#define	IBD_RC_MIN_CQ_SIZE	0x7f

/* Number of ibt_wc_t provided for each RC channel */
#define	IBD_RC_MAX_CQ_WC	0x3f

#if defined(_KERNEL) && !defined(_BOOT)

#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ib_pkt_hdrs.h>
#include <sys/list.h>
#include <sys/mac_provider.h>
#include <sys/mac_ib.h>
#include <sys/modhash.h>

/* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
typedef enum {
	IBD_RC_STATE_INIT = 0,

	/* Active side */
	IBD_RC_STATE_ACT_REP_RECV,	/* reply received */
	IBD_RC_STATE_ACT_ESTAB,		/* established, ready to send */
	IBD_RC_STATE_ACT_REJECT,	/* rejected */
	/* Someone else is closing this channel, please don't re-close it */
	IBD_RC_STATE_ACT_CLOSING,
	IBD_RC_STATE_ACT_CLOSED,
	IBD_RC_STATE_ACT_ERROR,

	/* Passive side */
	IBD_RC_STATE_PAS_REQ_RECV,	/* request received */
	IBD_RC_STATE_PAS_ESTAB,		/* established, ready to receive */
	IBD_RC_STATE_PAS_REJECT,	/* rejected */

	IBD_RC_STATE_PAS_CLOSED
} ibd_rc_chan_state_t;

/*
 * Structure to encapsulate various types of async requests.
 */
typedef struct ibd_acache_rq {
	struct list_node	rq_list;	/* list of pending work */
	int			rq_op;		/* what operation */
	ipoib_mac_t		rq_mac;
	ib_gid_t		rq_gid;
	void			*rq_ptr;
	void			*rq_ptr2;
} ibd_req_t;

typedef struct ibd_mcache {
	struct list_node	mc_list;	/* full/non list */
	uint8_t			mc_jstate;
	boolean_t		mc_fullreap;
	ibt_mcg_info_t		mc_info;
	ibd_req_t		mc_req;		/* to queue LEAVE req */
} ibd_mce_t;

typedef struct ibd_acache_s {
	struct list_node	ac_list;	/* free/active list */
	ibt_ud_dest_hdl_t	ac_dest;
	ipoib_mac_t		ac_mac;
	uint32_t		ac_ref;
	ibd_mce_t		*ac_mce;	/* for MCG AHs */

	/* For Reliable Connected mode */
	struct ibd_rc_chan_s	*ac_chan;
	/* protect tx_too_big_ongoing */
	kmutex_t		tx_too_big_mutex;
	/* Deal with too big packet */
	boolean_t		tx_too_big_ongoing;
} ibd_ace_t;

#define	IBD_MAX_SQSEG	59
#define	IBD_MAX_RQSEG	1

typedef enum {
	IBD_WQE_SEND,
	IBD_WQE_RECV
} ibd_wqe_type_t;

typedef enum {
	IBD_WQE_TXBUF = 1,
	IBD_WQE_LSOBUF = 2,
	IBD_WQE_MAPPED = 3,
	IBD_WQE_RC_COPYBUF = 4
} ibd_wqe_buftype_t;
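
/*
 * Illustrative sketch (assumed, simplified): in RC mode the unicast send
 * path is expected to look at the ACE's channel before using it, roughly:
 *
 *	ibd_rc_chan_t *chan = ace->ac_chan;
 *
 *	if (chan == NULL) {
 *		no RC connection has been set up for this destination yet;
 *		queue an async connect request and defer the packet.
 *	} else if (chan->chan_state == IBD_RC_STATE_ACT_ESTAB) {
 *		safe to post the send on this channel.
 *	} else {
 *		connection is coming up or going down (compare the
 *		rc_no_estab_conn counter in ibd_state_t).
 *	}
 */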

#ifdef DEBUG
typedef struct ibd_rc_stat_s {
	kstat_named_t		rc_rcv_trans_byte;
	kstat_named_t		rc_rcv_trans_pkt;
	kstat_named_t		rc_rcv_copy_byte;
	kstat_named_t		rc_rcv_copy_pkt;
	kstat_named_t		rc_rcv_alloc_fail;

	kstat_named_t		rc_rcq_err;	/* fail in rcq handler */

	kstat_named_t		rc_rwqe_short;	/* short rwqe */

	kstat_named_t		rc_xmt_bytes;
	/* pkt size <= state->id_rc_tx_copy_thresh */
	kstat_named_t		rc_xmt_small_pkt;
	kstat_named_t		rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	kstat_named_t		rc_xmt_map_fail_pkt;
	/* succ in ibt_map_mem_iov() */
	kstat_named_t		rc_xmt_map_succ_pkt;

	kstat_named_t		rc_ace_not_found;	/* ace not found */
	/* no swqe even after recycle */
	kstat_named_t		rc_scq_no_swqe;
	/* no tx large buf even after recycle */
	kstat_named_t		rc_scq_no_largebuf;

	/* short swqe in ibd_send() */
	kstat_named_t		rc_swqe_short;
	/* call mac_tx_update() when there are enough swqes */
	kstat_named_t		rc_swqe_mac_update;
	/* short large buf in ibd_send() */
	kstat_named_t		rc_xmt_buf_short;
	/* call mac_tx_update() when there are enough Tx large buffers */
	kstat_named_t		rc_xmt_buf_mac_update;

	kstat_named_t		rc_conn_succ;	/* # of successful connects */
	kstat_named_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	kstat_named_t		rc_null_conn;
	/* not in active established state */
	kstat_named_t		rc_no_estab_conn;

	kstat_named_t		rc_act_close;	/* call ibd_rc_act_close() */
	kstat_named_t		rc_pas_close;	/* call ibd_rc_pas_close() */
	kstat_named_t		rc_delay_ace_recycle;
	kstat_named_t		rc_act_close_simultaneous;

	kstat_named_t		rc_reset_cnt;	/* # of Reset RC channel */
	kstat_named_t		rc_timeout_act;
	kstat_named_t		rc_timeout_pas;
} ibd_rc_stat_t;
#endif

typedef struct ibd_rc_chan_list_s {
	/* This mutex protects chan_list and ibd_rc_chan_t.next */
	kmutex_t		chan_list_mutex;
	struct ibd_rc_chan_s	*chan_list;
} ibd_rc_chan_list_t;

typedef struct ibd_rc_tx_largebuf_s {
	struct ibd_rc_tx_largebuf_s	*lb_next;
	uint8_t				*lb_buf;
} ibd_rc_tx_largebuf_t;

/*
 * Pre-registered copybuf used for send and receive
 */
typedef struct ibd_copybuf_s {
	ibt_wr_ds_t		ic_sgl;
	uint8_t			*ic_bufaddr;
} ibd_copybuf_t;

typedef struct ibd_wqe_s {
	struct ibd_wqe_s	*w_next;
	ibd_copybuf_t		w_copybuf;
	mblk_t			*im_mblk;
} ibd_wqe_t;

/*
 * Send WQE
 */
typedef struct ibd_swqe_s {
	ibd_wqe_t		w_ibd_swqe;
	ibd_wqe_buftype_t	w_buftype;
	ibt_send_wr_t		w_swr;
	ibd_ace_t		*w_ahandle;
	ibt_mi_hdl_t		w_mi_hdl;
	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
	ibd_rc_tx_largebuf_t	*w_rc_tx_largebuf;
} ibd_swqe_t;

#define	swqe_next		w_ibd_swqe.w_next
#define	swqe_copybuf		w_ibd_swqe.w_copybuf
#define	swqe_im_mblk		w_ibd_swqe.im_mblk
#define	SWQE_TO_WQE(swqe)	(ibd_wqe_t *)&((swqe)->w_ibd_swqe)
#define	WQE_TO_SWQE(wqe)	(ibd_swqe_t *)wqe

/*
 * Receive WQE
 */
typedef struct ibd_rwqe_s {
	ibd_wqe_t		w_ibd_rwqe;
	struct ibd_state_s	*w_state;
	ibt_recv_wr_t		w_rwr;
	frtn_t			w_freemsg_cb;
	boolean_t		w_freeing_wqe;
	struct ibd_rc_chan_s	*w_chan;
} ibd_rwqe_t;

#define	rwqe_next		w_ibd_rwqe.w_next
#define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
#define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
#define	RWQE_TO_WQE(rwqe)	(ibd_wqe_t *)&((rwqe)->w_ibd_rwqe)
#define	WQE_TO_RWQE(wqe)	(ibd_rwqe_t *)wqe

typedef struct ibd_list_s {
	kmutex_t		dl_mutex;
	ibd_wqe_t		*dl_head;
	union {
		boolean_t	pending_sends;
		uint32_t	bufs_outstanding;
	} ustat;
	uint32_t		dl_cnt;
} ibd_list_t;

#define	dl_pending_sends	ustat.pending_sends
#define	dl_bufs_outstanding	ustat.bufs_outstanding
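
/*
 * Illustrative sketch (assumed convention): the send/receive WQE types
 * above embed a common ibd_wqe_t so that a completion can be mapped back
 * to its typed wrapper. Assuming the work request IDs are set to the wqe
 * addresses when posting, a completion handler would do roughly:
 *
 *	ibt_wc_t *wc;		one polled completion
 *	ibd_swqe_t *swqe = WQE_TO_SWQE((ibd_wqe_t *)(uintptr_t)wc->wc_id);
 *
 *	if (swqe->w_ahandle != NULL)
 *		ibd_dec_ref_ace(state, swqe->w_ahandle);
 *
 * and symmetrically WQE_TO_RWQE() for receive completions.
 */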

/*
 * LSO buffers
 *
 * Under normal circumstances we should never need to use any buffer
 * that's larger than MTU. Unfortunately, IB HCAs have limits on the
 * length of the SGL that are much smaller than those for regular
 * ethernet NICs. Since the network layer doesn't care to limit the
 * number of mblk fragments in any send mp chain, we end up having to
 * use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
 * buffers occasionally.
 */
typedef struct ibd_lsobuf_s {
	struct ibd_lsobuf_s *lb_next;
	uint8_t		*lb_buf;
	int		lb_isfree;
} ibd_lsobuf_t;

typedef struct ibd_lsobkt_s {
	uint8_t		*bkt_mem;
	ibd_lsobuf_t	*bkt_bufl;
	ibd_lsobuf_t	*bkt_free_head;
	ibt_mr_hdl_t	bkt_mr_hdl;
	ibt_mr_desc_t	bkt_mr_desc;
	uint_t		bkt_nelem;
	uint_t		bkt_nfree;
} ibd_lsobkt_t;

#define	IBD_PORT_DRIVER		0x1
#define	IBD_PARTITION_OBJ	0x2

/*
 * Posting to a single software rx post queue is contentious,
 * so break it out into an array of (multiple) queues.
 *
 * Try to ensure rx_queue structs fall in different cache lines using a filler.
 * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
 */
#define	RX_QUEUE_CACHE_LINE \
	(64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t)))
typedef struct ibd_rx_queue_s {
	kmutex_t		rx_post_lock;
	ibd_wqe_t		*rx_head;
	uint_t			rx_cnt;
	uint8_t			rx_pad[RX_QUEUE_CACHE_LINE];
} ibd_rx_queue_t;
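
/*
 * Illustrative only (not present in the original header): since the
 * comment above warns that RX_QUEUE_CACHE_LINE must track the struct
 * layout, a compile-time check such as the one below, using the stock
 * illumos CTASSERT() from <sys/debug.h>, could catch a mismatch if a
 * field is ever added to ibd_rx_queue_t:
 *
 *	CTASSERT(sizeof (ibd_rx_queue_t) == 64);
 */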

/*
 * This structure maintains information per port per HCA
 * (per network interface).
 */
typedef struct ibd_state_s {
	uint_t			id_type;
	dev_info_t		*id_dip;
	ibt_clnt_hdl_t		id_ibt_hdl;
	ibt_hca_hdl_t		id_hca_hdl;
	ibt_pd_hdl_t		id_pd_hdl;
	kmem_cache_t		*id_req_kmc;

	ibd_list_t		id_tx_rel_list;

	uint32_t		id_running;

	uint32_t		id_max_sqseg;
	uint32_t		id_max_sqseg_hiwm;
	ibd_list_t		id_tx_list;
	ddi_softintr_t		id_tx;
	uint32_t		id_tx_sends;

	kmutex_t		id_txpost_lock;
	ibd_swqe_t		*id_tx_head;
	ibd_swqe_t		*id_tx_tail;
	int			id_tx_busy;

	uint_t			id_tx_buf_sz;
	uint8_t			*id_tx_bufs;
	ibd_swqe_t		*id_tx_wqes;
	ibt_mr_hdl_t		id_tx_mr_hdl;
	ibt_mr_desc_t		id_tx_mr_desc;

	kmutex_t		id_lso_lock;
	ibd_lsobkt_t		*id_lso;

	kmutex_t		id_scq_poll_lock;
	int			id_scq_poll_busy;

	ibt_cq_hdl_t		id_scq_hdl;
	ibt_wc_t		*id_txwcs;
	uint32_t		id_txwcs_size;

	int			id_rx_nqueues;
	ibd_rx_queue_t		*id_rx_queues;
	int			id_rx_post_queue_index;
	uint32_t		id_rx_post_active;

	ibd_rwqe_t		*id_rx_wqes;
	uint8_t			*id_rx_bufs;
	ibt_mr_hdl_t		id_rx_mr_hdl;
	ibt_mr_desc_t		id_rx_mr_desc;
	uint_t			id_rx_buf_sz;
	/*
	 * id_ud_num_rwqe
	 * Number of "receive WQE" elements that will be allocated and used
	 * by ibd. This parameter is limited by the maximum channel size of
	 * the HCA. Each buffer in the receive wqe will be of MTU size.
	 */
	uint32_t		id_ud_num_rwqe;
	ibd_list_t		id_rx_list;
	ddi_softintr_t		id_rx;
	uint32_t		id_rx_bufs_outstanding_limit;
	uint32_t		id_rx_allocb;
	uint32_t		id_rx_allocb_failed;
	ibd_list_t		id_rx_free_list;

	kmutex_t		id_rcq_poll_lock;
	int			id_rcq_poll_busy;
	uint32_t		id_rxwcs_size;
	ibt_wc_t		*id_rxwcs;
	ibt_cq_hdl_t		id_rcq_hdl;

	ibt_channel_hdl_t	id_chnl_hdl;
	ib_pkey_t		id_pkey;
	uint16_t		id_pkix;
	uint8_t			id_port;
	ibt_mcg_info_t		*id_mcinfo;

	mac_handle_t		id_mh;
	mac_resource_handle_t	id_rh;
	ib_gid_t		id_sgid;
	ib_qpn_t		id_qpnum;
	ipoib_mac_t		id_macaddr;
	ib_gid_t		id_mgid;
	ipoib_mac_t		id_bcaddr;

	int			id_mtu;
	uchar_t			id_scope;

	kmutex_t		id_acache_req_lock;
	kcondvar_t		id_acache_req_cv;
	struct list		id_req_list;
	kt_did_t		id_async_thrid;

	kmutex_t		id_ac_mutex;
	ibd_ace_t		*id_ac_hot_ace;
	struct list		id_ah_active;
	struct list		id_ah_free;
	ipoib_mac_t		id_ah_addr;
	ibd_req_t		id_ah_req;
	char			id_ah_op;
	uint64_t		id_ah_error;
	ibd_ace_t		*id_ac_list;
	mod_hash_t		*id_ah_active_hash;

	kmutex_t		id_mc_mutex;
	struct list		id_mc_full;
	struct list		id_mc_non;

	kmutex_t		id_trap_lock;
	kcondvar_t		id_trap_cv;
	boolean_t		id_trap_stop;
	uint32_t		id_trap_inprog;

	char			id_prom_op;

	kmutex_t		id_sched_lock;
	int			id_sched_needed;
	int			id_sched_cnt;
	int			id_sched_lso_cnt;

	kmutex_t		id_link_mutex;
	link_state_t		id_link_state;
	uint64_t		id_link_speed;

	uint64_t		id_num_intrs;
	uint64_t		id_tx_short;
	/*
	 * id_ud_num_swqe
	 * Number of "send WQE" elements that will be allocated and used by
	 * ibd. When tuning this parameter, the size of pre-allocated, pre-
	 * mapped copy buffer in each of these send wqes must be taken into
	 * account. This copy buffer size is determined by the value of
	 * IBD_TX_BUF_SZ (this is currently set to the same value as
	 * ibd_tx_copy_thresh, but may be changed independently if needed).
	 */
	uint32_t		id_ud_num_swqe;

	uint64_t		id_xmt_bytes;
	uint64_t		id_rcv_bytes;
	uint64_t		id_multi_xmt;
	uint64_t		id_brd_xmt;
	uint64_t		id_multi_rcv;
	uint64_t		id_brd_rcv;
	uint64_t		id_xmt_pkt;
	uint64_t		id_rcv_pkt;

	uint32_t		id_hwcksum_capab;
	boolean_t		id_lso_policy;
	boolean_t		id_lso_capable;
	uint_t			id_lso_maxlen;
	int			id_hca_res_lkey_capab;
	ibt_lkey_t		id_res_lkey;

	boolean_t		id_bgroup_created;
	kmutex_t		id_macst_lock;
	kcondvar_t		id_macst_cv;
	uint32_t		id_mac_state;

	/* For Reliable Connected Mode */
	boolean_t		id_enable_rc;
	boolean_t		rc_enable_srq;

	int			rc_mtu;
	uint32_t		rc_tx_max_sqseg;
	/*
	 * In IPoIB over Reliable Connected mode, the mac address carries an
	 * "IBD_MAC_ADDR_RC" prefix. But for the loopback filter in
	 * ibd_process_rx(), the input mac address should not include the
	 * "IBD_MAC_ADDR_RC" prefix.
	 *
	 * So, we introduce rc_macaddr_loopback for the loopback filter in
	 * IPoIB over Reliable Connected mode.
	 *
	 * rc_macaddr_loopback = id_macaddr without the "IBD_MAC_ADDR_RC"
	 * prefix.
	 */
	ipoib_mac_t		rc_macaddr_loopback;

	ibt_srv_hdl_t		rc_listen_hdl;
	ibt_sbind_hdl_t		rc_listen_bind;
	ibt_srv_hdl_t		rc_listen_hdl_OFED_interop;
	ibt_sbind_hdl_t		rc_listen_bind_OFED_interop;

	ibd_rc_chan_list_t	rc_pass_chan_list;
	/* obsolete active channel list */
	ibd_rc_chan_list_t	rc_obs_act_chan_list;

	kmutex_t		rc_ace_recycle_lock;
	ibd_ace_t		*rc_ace_recycle;

	/* Send */
	/*
	 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
	 * and ibd_rc_tx_largebuf_t->lb_next
	 */
	kmutex_t		rc_tx_large_bufs_lock;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_free_head;
	uint_t			rc_tx_largebuf_nfree;
	/* The chunk of all Tx large buffers */
	uint8_t			*rc_tx_mr_bufs;
	ibt_mr_hdl_t		rc_tx_mr_hdl;
	ibt_mr_desc_t		rc_tx_mr_desc;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_desc_base;	/* base addr */

	boolean_t		rc_enable_iov_map;
	uint_t			rc_max_sqseg_hiwm;

	/* For SRQ */
	uint32_t		rc_srq_size;
	ibt_srq_hdl_t		rc_srq_hdl;
	ibd_list_t		rc_srq_rwqe_list;
	ibd_list_t		rc_srq_free_list;
	ibd_rwqe_t		*rc_srq_rwqes;
	uint8_t			*rc_srq_rx_bufs;
	ibt_mr_hdl_t		rc_srq_rx_mr_hdl;
	ibt_mr_desc_t		rc_srq_rx_mr_desc;

	/* For chained receive */
	kmutex_t		rc_rx_lock;
	mblk_t			*rc_rx_mp;
	mblk_t			*rc_rx_mp_tail;
	uint32_t		rc_rx_mp_len;

	uint32_t		rc_num_tx_chan;
	uint32_t		rc_num_rx_chan;

	/* Protect rc_timeout_start and rc_timeout */
	kmutex_t		rc_timeout_lock;
	boolean_t		rc_timeout_start;
	timeout_id_t		rc_timeout;

	/* Counters for RC mode */
	/* RX */
	/*
	 * # of received packets that are transferred directly to GLD
	 * without copying them
	 */
	uint64_t		rc_rcv_trans_byte;
	uint64_t		rc_rcv_trans_pkt;
	/*
	 * # of received packets for which we allocate new buffers, copy
	 * their content into the new buffers, then transfer them to GLD
	 */
	uint64_t		rc_rcv_copy_byte;
	uint64_t		rc_rcv_copy_pkt;
	uint64_t		rc_rcv_alloc_fail;

#ifdef DEBUG
	uint64_t		rc_rwqe_short;	/* short rwqe */
#endif

	/* wc->wc_status != IBT_WC_SUCCESS */
	uint64_t		rc_rcq_err;

	/* Tx */
	uint64_t		rc_xmt_bytes;

	/* pkt size <= ibd_rc_tx_copy_thresh */
	uint64_t		rc_xmt_small_pkt;
	uint64_t		rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_fail_pkt;
	/* succ in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_succ_pkt;

	uint64_t		rc_ace_not_found;

	uint64_t		rc_xmt_drop_too_long_pkt;
	uint64_t		rc_xmt_icmp_too_long_pkt;
	uint64_t		rc_xmt_reenter_too_long_pkt;

	/* short swqe in ibd_send() */
	uint64_t		rc_swqe_short;
	/* call mac_tx_update() when there are enough swqes */
	uint64_t		rc_swqe_mac_update;
	/* short tx large copy buf in ibd_send() */
	uint64_t		rc_xmt_buf_short;
	/* call mac_tx_update() when there are enough Tx copy bufs */
	uint64_t		rc_xmt_buf_mac_update;

	/* No swqe even after calling the swqe recycle function */
	uint64_t		rc_scq_no_swqe;
	/* No large Tx buf even after calling the swqe recycle function */
	uint64_t		rc_scq_no_largebuf;

	/* Connection setup and close */
	uint64_t		rc_conn_succ;	/* # of successful connects */
	uint64_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	uint64_t		rc_null_conn;
	/* not in active established state */
	uint64_t		rc_no_estab_conn;

	uint64_t		rc_act_close;	/* call ibd_rc_act_close() */
	uint64_t		rc_pas_close;	/* call ibd_rc_pas_close() */
	uint64_t		rc_delay_ace_recycle;
	uint64_t		rc_act_close_simultaneous;
	/* Failed to close a channel because someone else is still using it */
	uint64_t		rc_act_close_not_clean;
	/* RCQ is being invoked when closing RC channel */
	uint64_t		rc_pas_close_rcq_invoking;

	/* The number of RC channel resets */
	uint64_t		rc_reset_cnt;

	uint64_t		rc_timeout_act;
	uint64_t		rc_timeout_pas;

	/*
	 * Failed to stop this port because this port is connecting to a
	 * remote port
	 */
	uint64_t		rc_stop_connect;

#ifdef DEBUG
	kstat_t			*rc_ksp;
#endif
	ib_guid_t		id_hca_guid;
	ib_guid_t		id_port_guid;
	datalink_id_t		id_dlinkid;
	datalink_id_t		id_plinkid;
	int			id_port_inst;
	struct ibd_state_s	*id_next;
	boolean_t		id_force_create;
	boolean_t		id_bgroup_present;
	uint_t			id_hca_max_chan_sz;

	/*
	 * UD Mode Tunables
	 *
	 * id_ud_tx_copy_thresh
	 * This sets the threshold at which ibd will attempt to do a bcopy
	 * of the outgoing data into a pre-mapped buffer. The IPoIB driver's
	 * send behavior is restricted by various parameters, so this value
	 * should only be changed after careful consideration. For
	 * instance, IB HCAs currently impose a relatively small limit
	 * (when compared to ethernet NICs) on the length of the SGL for
	 * transmit. On the other hand, the ip stack could send down mp
	 * chains that are quite long when LSO is enabled.
	 *
	 * id_num_lso_bufs
	 * Number of "larger-than-MTU" copy buffers to use for cases when the
	 * outgoing mblk chain is too fragmented to be used with
	 * ibt_map_mem_iov() and too large to be used with regular MTU-sized
	 * copy buffers. It is not recommended to tune this variable without
	 * understanding the application environment and/or memory resources.
	 * The size of each of these lso buffers is determined by the value of
	 * IBD_LSO_BUFSZ.
	 *
	 * id_num_ah
	 * Number of AH cache entries to allocate
	 *
	 * id_hash_size
	 * Hash table size for the active AH list
	 */
	uint_t			id_ud_tx_copy_thresh;
	uint_t			id_num_lso_bufs;
	uint_t			id_num_ah;
	uint_t			id_hash_size;

	boolean_t		id_create_broadcast_group;

	boolean_t		id_allow_coalesce_comp_tuning;
	uint_t			id_ud_rx_comp_count;
	uint_t			id_ud_rx_comp_usec;
	uint_t			id_ud_tx_comp_count;
	uint_t			id_ud_tx_comp_usec;

	/* RC Mode Tunables */

	uint_t			id_rc_rx_comp_count;
	uint_t			id_rc_rx_comp_usec;
	uint_t			id_rc_tx_comp_count;
	uint_t			id_rc_tx_comp_usec;
	/*
	 * id_rc_tx_copy_thresh
	 * This sets the threshold at which ibd will attempt to do a bcopy
	 * of the outgoing data into a pre-mapped buffer.
	 *
	 * id_rc_rx_copy_thresh
	 * If (the size of incoming buffer <= id_rc_rx_copy_thresh), ibd
	 * will attempt to allocate a buffer and do a bcopy of the incoming
	 * data into the allocated buffer.
	 *
	 * id_rc_rx_rwqe_thresh
	 * If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd
	 * will attempt to allocate a buffer and do a bcopy of the incoming
	 * data into the allocated buffer.
	 *
	 * id_rc_num_swqe
	 * 1) Send CQ size = ibd_rc_num_swqe
	 * 2) The send queue size = ibd_rc_num_swqe - 1
	 * 3) Number of pre-allocated Tx buffers for ibt_post_send() =
	 * ibd_rc_num_swqe - 1.
	 *
	 * id_rc_num_rwqe
	 * 1) For non-SRQ, we pre-post id_rc_num_rwqe number of WRs
	 * via ibt_post_receive() for the receive queue of each RC channel.
	 * 2) For SRQ and non-SRQ, receive CQ size = id_rc_num_rwqe
	 *
	 * For SRQ
	 * If using SRQ, we allocate id_rc_num_srq buffers (the size of each
	 * buffer is equal to the RC mtu) and post them via ibt_post_srq().
	 *
	 * id_rc_num_srq
	 * id_rc_num_srq should not be larger than id_rc_num_rwqe,
	 * otherwise it will cause a bug with the following warnings:
	 * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
	 * NOTICE: hermon0: Device Error: EQE local work queue catastrophic
	 * error
	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
	 * catastrophic channel error
	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
	 * completion queue error
	 */
	uint_t			id_rc_tx_copy_thresh;
	uint_t			id_rc_rx_copy_thresh;
	uint_t			id_rc_rx_rwqe_thresh;
	uint_t			id_rc_num_swqe;
	uint_t			id_rc_num_rwqe;
	uint_t			id_rc_num_srq;
} ibd_state_t;
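
/*
 * Illustrative sketch (assumed, not the driver's exact code): given the
 * constraint documented above that id_rc_num_srq must stay below
 * id_rc_num_rwqe, tunable validation could look roughly like:
 *
 *	if (state->id_rc_num_srq > state->id_rc_num_rwqe - 1) {
 *		ibd_print_warn(state,
 *		    "rc_num_srq too large, clamping to rc_num_rwqe - 1");
 *		state->id_rc_num_srq = state->id_rc_num_rwqe - 1;
 *	}
 *
 * which matches the defaults (IBD_DEF_RC_NUM_SRQ == IBD_DEF_RC_NUM_RWQE - 1).
 */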

/*
 * Structures to track global IBTF data, data that is shared
 * among the IBD device instances. This includes the one ibt_hdl
 * and the list of service registrations.
 */
typedef struct ibd_service_s {
	struct ibd_service_s	*is_link;
	ibt_srv_hdl_t		is_srv_hdl;
	ib_svc_id_t		is_sid;
	uint_t			is_ref_cnt;
} ibd_service_t;

typedef struct ibd_global_state_s {
	kmutex_t	ig_mutex;
	ibt_clnt_hdl_t	ig_ibt_hdl;
	uint_t		ig_ibt_hdl_ref_cnt;
	ibd_service_t	*ig_service_list;
} ibd_global_state_t;

typedef struct ibd_rc_msg_hello_s {
	uint32_t	reserved_qpn;
	uint32_t	rx_mtu;
} ibd_rc_msg_hello_t;

typedef struct ibd_rc_chan_s {
	struct ibd_rc_chan_s	*next;
	/* channel hdl that we'll be using for Reliable Connected Mode */
	ibt_channel_hdl_t	chan_hdl;
	struct ibd_state_s	*state;
	ibd_ace_t		*ace;
	ibd_rc_chan_state_t	chan_state;

	ibd_list_t		tx_wqe_list;	/* free wqe list */
	ibd_list_t		tx_rel_list;	/* for swqe recycle */

	ibd_swqe_t		*tx_wqes;

	/* start address of Tx Buffers */
	uint8_t			*tx_mr_bufs;
	ibt_mr_hdl_t		tx_mr_hdl;
	ibt_mr_desc_t		tx_mr_desc;

	ibt_cq_hdl_t		scq_hdl;	/* Tx completion queue */
	ibt_wc_t		tx_wc[IBD_RC_MAX_CQ_WC];
	ddi_softintr_t		scq_softintr;

	/* For chained send */
	kmutex_t		tx_post_lock;
	ibd_swqe_t		*tx_head;
	ibd_swqe_t		*tx_tail;
	int			tx_busy;

	/* For tx buffer recycle */
	kmutex_t		tx_poll_lock;
	int			tx_poll_busy;

	/* Rx */
	ibd_list_t		rx_wqe_list;	/* used by ibt_post_recv */
	ibd_list_t		rx_free_list;	/* free rwqe list */

	ibt_cq_hdl_t		rcq_hdl;	/* Rx completion queue */
	ibt_wc_t		rx_wc[IBD_RC_MAX_CQ_WC];

	ibd_rwqe_t		*rx_rwqes;	/* the chunk of all rwqes */
	uint8_t			*rx_bufs;	/* the chunk of all Rx bufs */
	ibt_mr_hdl_t		rx_mr_hdl;	/* ibt_mr_hdl_t for rx_bufs */
	ibt_mr_desc_t		rx_mr_desc;	/* ibt_mr_desc_t for rx_bufs */

	/* For chained receive */
	kmutex_t		rx_lock;
	mblk_t			*rx_mp;
	mblk_t			*rx_mp_tail;
	uint32_t		rx_mp_len;

	uint32_t		rcq_size;
	uint32_t		scq_size;
	/*
	 * We need two channels for each connection.
	 * One channel for Tx; another channel for Rx.
	 * If "is_tx_chan == B_TRUE", this is a Tx channel.
	 */
	boolean_t		is_tx_chan;

	/*
	 * For the connection reaper routine ibd_rc_conn_timeout_call().
	 * "is_used == B_FALSE" indicates this RC channel has not been used
	 * for a long (= ibd_rc_conn_timeout) time.
	 */
	boolean_t		is_used;
	/*
	 * When closing this channel, we need to make sure
	 * "chan->rcq_invoking == 0".
	 */
	uint32_t		rcq_invoking;
} ibd_rc_chan_t;

/*
 * The following functions are defined in "ibd.c".
 * They are also used by "ibd_cm.c".
 */
void ibd_print_warn(ibd_state_t *, char *, ...);
void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
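
/*
 * Illustrative sketch (assumed usage of the interfaces above, not copied
 * from ibd.c): non-interrupt work such as address resolution is handed to
 * the async thread by allocating an ibd_req_t from id_req_kmc, filling in
 * the operation and its arguments, and queueing it:
 *
 *	ibd_req_t *req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *
 *	if (req != NULL) {
 *		req->rq_mac = *dest;	destination ipoib_mac_t
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
 *	}
 *
 * The async thread then dispatches on rq_op (IBD_ASYNC_GETAH, _JOIN,
 * _LEAVE, ... as listed at the top of this header).
 */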

/*
 * The following functions are defined in "ibd_cm.c".
 * They are also used in "ibd.c".
 */
void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *);

/* Connection Setup/Close Functions */
ibt_status_t ibd_rc_listen(ibd_state_t *);
void ibd_rc_stop_listen(ibd_state_t *);
ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *,
    uint64_t);
void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *);
void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *);
void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *);
int ibd_rc_pas_close(ibd_rc_chan_t *, boolean_t, boolean_t);
void ibd_rc_close_all_chan(ibd_state_t *);
void ibd_rc_conn_timeout_call(void *carg);

/* Receive Functions */
int ibd_rc_init_srq_list(ibd_state_t *);
void ibd_rc_fini_srq_list(ibd_state_t *);
int ibd_rc_repost_srq_free_list(ibd_state_t *);

/* Send Functions */
int ibd_rc_init_tx_largebuf_list(ibd_state_t *);
void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *);
void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *);
void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t);
void ibd_rc_tx_cleanup(ibd_swqe_t *);

/* Others */
void ibd_rc_get_conf(ibd_state_t *);
int ibd_rc_init_stats(ibd_state_t *);

#endif	/* _KERNEL && !_BOOT */

#ifdef __cplusplus
}
#endif

#endif	/* _SYS_IB_CLIENTS_IBD_H */