/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _SYS_IB_CLIENTS_IBD_H
#define	_SYS_IB_CLIENTS_IBD_H

#ifdef __cplusplus
extern "C" {
#endif

/* The following macros are used in both ibd.c and ibd_cm.c */

/*
 * Completion queue polling control
 */
#define	IBD_CQ_POLLING			0x1
#define	IBD_REDO_CQ_POLLING		0x2

/*
 * Maximum length for returning chained mps back to crossbow.
 * Also used as the maximum number of rx wc's polled at a time.
 */
#define	IBD_MAX_RX_MP_LEN		16

/*
 * When doing multiple-send-wr, this value determines how many to do at
 * a time (in a single ibt_post_send).
 */
#define	IBD_MAX_TX_POST_MULTIPLE	4

/*
 * Flag bits for resources to reap
 */
#define	IBD_RSRC_SWQE			0x1
#define	IBD_RSRC_LSOBUF			0x2
#define	IBD_RSRC_RC_SWQE		0x4
#define	IBD_RSRC_RC_TX_LARGEBUF		0x8

/*
 * Async operation types
 */
#define	IBD_ASYNC_GETAH			1
#define	IBD_ASYNC_JOIN			2
#define	IBD_ASYNC_LEAVE			3
#define	IBD_ASYNC_PROMON		4
#define	IBD_ASYNC_PROMOFF		5
#define	IBD_ASYNC_REAP			6
#define	IBD_ASYNC_TRAP			7
#define	IBD_ASYNC_SCHED			8
#define	IBD_ASYNC_LINK			9
#define	IBD_ASYNC_EXIT			10
#define	IBD_ASYNC_RC_TOO_BIG		11
#define	IBD_ASYNC_RC_CLOSE_ACT_CHAN	12
#define	IBD_ASYNC_RC_RECYCLE_ACE	13

/*
 * Miscellaneous constants
 */
#define	IBD_SEND			0
#define	IBD_RECV			1

/*
 * Thresholds
 *
 * When waiting for resources (swqes or lso buffers) to become available,
 * the first two thresholds below determine how long to wait before informing
 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
 * determines how low the available swqes should go before we start polling
 * the completion queue.
 */
#define	IBD_FREE_LSOS_THRESH		8
#define	IBD_FREE_SWQES_THRESH		20
#define	IBD_TX_POLL_THRESH		80

#ifdef DEBUG
void debug_print(int l, char *fmt, ...);
#define	DPRINT		debug_print
#else
#define	DPRINT		0 &&
#endif
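/*
 * Illustrative usage (not part of the driver sources): in a DEBUG build
 * DPRINT expands to debug_print(), whose first argument is a verbosity
 * level; in a non-DEBUG build the "0 &&" expansion turns the call into a
 * no-op comma expression. A hypothetical call site might look like:
 *
 *	DPRINT(10, "ibd_send: no swqe available, deferring packet");
 *
 * The level value and message text above are examples only.
 */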
/*
 * AH and MCE active list manipulation:
 *
 * Multicast disable requests and MCG delete traps are two cases
 * where the active AH entry for the mcg (if any unreferenced one exists)
 * will be moved to the free list (to force the next Tx to the mcg to
 * join the MCG in SendOnly mode). Port up handling will also move AHs
 * from active to free list.
 *
 * In the case when some transmits are still pending on an entry
 * for an mcg, but a multicast disable has already been issued on the
 * mcg, there are some options to consider to preserve the join state
 * to ensure the emitted packet is properly routed on the IBA fabric.
 * For the AH, we can
 * 1. take out of active list at multicast disable time.
 * 2. take out of active list only when last pending Tx completes.
 * For the MCE, we can
 * 3. take out of active list at multicast disable time.
 * 4. take out of active list only when last pending Tx completes.
 * 5. move from active list to stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
 * a stale list (#5) that would be checked in the enable code would need
 * to be implemented. Option 2 is used, because otherwise, a Tx attempt
 * after the multicast disable would try to put an AH in the active list,
 * and associate the mce it finds in the active list to this new AH,
 * whereas the mce is already associated with the previous AH (taken off
 * the active list), and will be removed once the pending Tx's complete
 * (unless a reference count on mce's is implemented). One implication of
 * using 2,4 is that new Tx's posted before the pending Tx's complete will
 * grab new references on the AH, further delaying the leave.
 *
 * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes thru
 * reincarnations).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it can not possibly receive a mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best. Thus, the driver just prints a warning
 * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */

/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that the max reference
 * count on any handle is limited by the number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, num);		\
}
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		CLEAR_REFCYCLE(ace), B_FALSE :		\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?	\
	/*						\
	 * Ref count known to be 0 from above.		\
	 */						\
	B_TRUE :					\
	B_FALSE						\
)
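/*
 * Illustrative sketch (an assumption based on the description above, not
 * driver code) of how the reference/recycle macros are meant to interact:
 *
 *	Tx path, after looking up the ace for a destination:
 *		INC_REF(ace, 1);
 *
 *	Tx completion handler, when a send referencing the ace completes:
 *		if (DEC_REF_DO_CYCLE(ace))
 *			... ref count reached 0 with the recycle bit set;
 *			    the ace may now be reaped ...
 *
 *	Recycle request (e.g. on multicast disable):
 *		if (SET_CYCLE_IF_REF(ace) == B_FALSE)
 *			... ref count was already 0; recycle immediately ...
 *
 * See ibd.c for the actual use of these macros.
 */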
/*
 * Address handle entries maintained by the driver are kept in the
 * free and active lists. Each entry starts out in the free list;
 * it migrates to the active list when primed using ibt_get_paths()
 * and ibt_modify_ud_dest() for transmission to a specific destination.
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (ie the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are atomically
 * done (increments done by Tx path, decrements by the Tx callback handler).
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
	state->id_ac_hot_ace = ce;				\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	if (state->id_ac_hot_ace == ce)				\
		state->id_ac_hot_ace = NULL;			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)
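/*
 * Illustrative sketch (an assumption, not driver code): the async thread,
 * while servicing an IBD_ASYNC_GETAH request, might migrate an entry from
 * the free list to the active list roughly as follows:
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	if ((ce = IBD_ACACHE_GET_FREE(state)) != NULL) {
 *		... fill in ce->ac_mac, prime ce->ac_dest via
 *		    ibt_get_paths() and ibt_modify_ud_dest() ...
 *		IBD_ACACHE_INSERT_ACTIVE(state, ce);
 *	}
 *	mutex_exit(&state->id_ac_mutex);
 *
 * See ibd.c for the real lookup, priming and error handling.
 */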
/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts the padding by default at the end. The routine which does this
 * is nce_xmit() in ip_ndp.c. It copies the nd_lla_addr after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the
 * IBD driver, it is in the following format:
 * [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
 * machdr is not 4 byte aligned and has 2 bytes of padding at the end.
 *
 * The send routine at the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in machdr being
 * 4 byte aligned.
 *
 * At the receiving side ibd_process_rx takes the above packet and
 * removes the two bytes of front padding and inserts them at the end.
 * This is because the IP layer does not understand padding at the front.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {				\
	uchar_t		*nd_lla_ptr;				\
	icmp6_t		*icmp6;					\
	nd_opt_hdr_t	*opt;					\
	int		i;					\
								\
	icmp6 = (icmp6_t *)&ip6h[1];				\
	len -= sizeof (nd_neighbor_advert_t);			\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||	\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&	\
	    (len != 0)) {					\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
		ASSERT(opt != NULL);				\
		nd_lla_ptr = (uchar_t *)&opt[1];		\
		if (type == IBD_SEND) {				\
			for (i = IPOIB_ADDRL; i > 0; i--)	\
				*(nd_lla_ptr + i + 1) =		\
				    *(nd_lla_ptr + i - 1);	\
		} else {					\
			for (i = 0; i < IPOIB_ADDRL; i++)	\
				*(nd_lla_ptr + i) =		\
				    *(nd_lla_ptr + i + 2);	\
		}						\
		*(nd_lla_ptr + i) = 0;				\
		*(nd_lla_ptr + i + 1) = 0;			\
	}							\
}
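/*
 * Worked example (illustrative, derived from the macro above): with
 * IPOIB_ADDRL == 20, the link-layer address inside the ND option is
 * shifted by two bytes.
 *
 *	IBD_SEND:	lla bytes at offsets 0..19 move to offsets 2..21,
 *			and offsets 0..1 are zeroed (padding at the front).
 *	IBD_RECV:	lla bytes at offsets 2..21 move back to 0..19,
 *			and offsets 20..21 are zeroed (padding at the end).
 *
 * Offsets are relative to nd_lla_ptr, the first byte after nd_opt_hdr_t.
 */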
/*
 * IETF defined IPoIB encapsulation header, with 2 bytes of ethertype
 * followed by 2 reserved bytes. This is at the start of the
 * datagram sent to and received over the wire by the driver.
 */
typedef struct ipoib_header {
	ushort_t	ipoib_type;
	ushort_t	ipoib_mbz;
} ipoib_hdr_t;

#define	IPOIB_HDRSIZE	sizeof (struct ipoib_header)

/*
 * IETF defined IPoIB link address; IBA QPN, followed by GID,
 * which has a prefix and suffix, as reported via ARP.
 */
typedef struct ipoib_mac {
	uint32_t	ipoib_qpn;
	uint32_t	ipoib_gidpref[2];
	uint32_t	ipoib_gidsuff[2];
} ipoib_mac_t;

#define	IPOIB_ADDRL	sizeof (struct ipoib_mac)

/*
 * Pseudo header prepended to datagram in DLIOCRAW transmit path
 * and when GLD hands the datagram to the gldm_send entry point.
 */
typedef struct ipoib_ptxhdr {
	ipoib_mac_t	ipoib_dest;
	ipoib_hdr_t	ipoib_rhdr;
} ipoib_ptxhdr_t;

#define	IPOIBDLSAP(p, offset)	((ipoib_ptxhdr_t *)((caddr_t)(p)+offset))

/*
 * The pseudo-GRH structure that sits before the data in the
 * receive buffer, and is overlaid on top of the real GRH.
 * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
 * does not hold valid information. If it is indicated valid,
 * the driver must additionally provide the sender's qpn in
 * network byte order in ipoib_sqpn, and not touch the
 * remaining parts which were DMA'ed in by the IBA hardware.
 */
typedef struct ipoib_pgrh {
	uint32_t	ipoib_vertcflow;
	uint32_t	ipoib_sqpn;
	uint32_t	ipoib_sgid_pref[2];
	uint32_t	ipoib_sgid_suff[2];
	uint32_t	ipoib_dgid_pref[2];
	uint32_t	ipoib_dgid_suff[2];
} ipoib_pgrh_t;

/*
 * The GRH is also dma'ed into recv buffers, thus space needs
 * to be allocated for them.
 */
#define	IPOIB_GRH_SIZE	sizeof (ipoib_pgrh_t)

/* support the RC (reliable connected) mode */
#define	IBD_MAC_ADDR_RC		0x80000000
/* support the UC (unreliable connected) mode */
#define	IBD_MAC_ADDR_UC		0x40000000

#define	IBD_RC_SERVICE_ID 0x100000000000000ULL

/*
 * Legacy OFED had used a wrong service ID (one additional zero digit) for
 * many years. To interoperate with legacy OFED, we support this wrong
 * service ID here.
 */
#define	IBD_RC_SERVICE_ID_OFED_INTEROP 0x1000000000000000ULL

#define	IBD_RC_MIN_CQ_SIZE	0x7f

/* Number of ibt_wc_t provided for each RC channel */
#define	IBD_RC_MAX_CQ_WC	0x3f

#if defined(_KERNEL) && !defined(_BOOT)

#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ib_pkt_hdrs.h>
#include <sys/list.h>
#include <sys/mac_provider.h>
#include <sys/mac_ib.h>
#include <sys/modhash.h>

/* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
typedef enum {
	IBD_RC_STATE_INIT = 0,

	/* Active side */
	IBD_RC_STATE_ACT_REP_RECV,	/* reply received */
	IBD_RC_STATE_ACT_ESTAB,		/* established, ready to send */
	IBD_RC_STATE_ACT_REJECT,	/* rejected */
	/* Someone else is closing this channel, please don't re-close it */
	IBD_RC_STATE_ACT_CLOSING,
	IBD_RC_STATE_ACT_CLOSED,
	IBD_RC_STATE_ACT_ERROR,

	/* Passive side */
	IBD_RC_STATE_PAS_REQ_RECV,	/* request received */
	IBD_RC_STATE_PAS_ESTAB,		/* established, ready to receive */
	IBD_RC_STATE_PAS_REJECT,	/* rejected */

	IBD_RC_STATE_PAS_CLOSED
} ibd_rc_chan_state_t;

/*
 * Structure to encapsulate various types of async requests.
 */
typedef struct ibd_acache_rq {
	struct list_node	rq_list;	/* list of pending work */
	int			rq_op;		/* what operation */
	ipoib_mac_t		rq_mac;
	ib_gid_t		rq_gid;
	void			*rq_ptr;
	void			*rq_ptr2;
} ibd_req_t;

typedef struct ibd_mcache {
	struct list_node	mc_list;	/* full/non list */
	uint8_t			mc_jstate;
	boolean_t		mc_fullreap;
	ibt_mcg_info_t		mc_info;
	ibd_req_t		mc_req;		/* to queue LEAVE req */
} ibd_mce_t;

typedef struct ibd_acache_s {
	struct list_node	ac_list;	/* free/active list */
	ibt_ud_dest_hdl_t	ac_dest;
	ipoib_mac_t		ac_mac;
	uint32_t		ac_ref;
	ibd_mce_t		*ac_mce;	/* for MCG AHs */

	/* For Reliable Connected mode */
	struct ibd_rc_chan_s	*ac_chan;
	/* protect tx_too_big_ongoing */
	kmutex_t		tx_too_big_mutex;
	/* Deal with too big packet */
	boolean_t		tx_too_big_ongoing;
} ibd_ace_t;

#define	IBD_MAX_SQSEG	59
#define	IBD_MAX_RQSEG	1

typedef enum {
	IBD_WQE_SEND,
	IBD_WQE_RECV
} ibd_wqe_type_t;

typedef enum {
	IBD_WQE_TXBUF = 1,
	IBD_WQE_LSOBUF = 2,
	IBD_WQE_MAPPED = 3,
	IBD_WQE_RC_COPYBUF = 4
} ibd_wqe_buftype_t;

#ifdef DEBUG
typedef struct ibd_rc_stat_s {
	kstat_named_t		rc_rcv_trans_byte;
	kstat_named_t		rc_rcv_trans_pkt;
	kstat_named_t		rc_rcv_copy_byte;
	kstat_named_t		rc_rcv_copy_pkt;
	kstat_named_t		rc_rcv_alloc_fail;

	kstat_named_t		rc_rcq_invoke;
	kstat_named_t		rc_rcq_err;	/* fail in rcq handler */
	kstat_named_t		rc_scq_invoke;

	kstat_named_t		rc_rwqe_short;	/* short rwqe */

	kstat_named_t		rc_xmt_bytes;
	/* pkt size <= ibd_rc_tx_copy_thresh */
	kstat_named_t		rc_xmt_small_pkt;
	kstat_named_t		rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	kstat_named_t		rc_xmt_map_fail_pkt;
	/* succ in ibt_map_mem_iov() */
	kstat_named_t		rc_xmt_map_succ_pkt;

	kstat_named_t		rc_ace_not_found;	/* ace not found */
	/* no swqe even after recycle */
	kstat_named_t		rc_scq_no_swqe;
	/* no tx large buf even after recycle */
	kstat_named_t		rc_scq_no_largebuf;

	/* short swqe in ibd_send() */
	kstat_named_t		rc_swqe_short;
	/* call mac_tx_update() when there is enough swqe */
	kstat_named_t		rc_swqe_mac_update;
	/* short large buf in ibd_send() */
	kstat_named_t		rc_xmt_buf_short;
	/* call mac_tx_update() when there are enough Tx large buffers */
	kstat_named_t		rc_xmt_buf_mac_update;

	kstat_named_t		rc_conn_succ;	/* # of successful connects */
	kstat_named_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	kstat_named_t		rc_null_conn;
	/* not in active established state */
	kstat_named_t		rc_no_estab_conn;

	kstat_named_t		rc_act_close;	/* call ibd_rc_act_close() */
	kstat_named_t		rc_pas_close;	/* call ibd_rc_pas_close() */
	kstat_named_t		rc_delay_ace_recycle;
	kstat_named_t		rc_act_close_simultaneous;

	kstat_named_t		rc_reset_cnt;	/* # of RC channel resets */
} ibd_rc_stat_t;
#endif

typedef struct ibd_rc_chan_list_s {
	/* This mutex protects chan_list and ibd_rc_chan_t.next */
	kmutex_t		chan_list_mutex;
	struct ibd_rc_chan_s	*chan_list;
} ibd_rc_chan_list_t;

typedef struct ibd_rc_tx_largebuf_s {
	struct ibd_rc_tx_largebuf_s	*lb_next;
	uint8_t				*lb_buf;
} ibd_rc_tx_largebuf_t;

/*
 * Pre-registered copybuf used for send and receive
 */
typedef struct ibd_copybuf_s {
	ibt_wr_ds_t		ic_sgl;
	uint8_t			*ic_bufaddr;
} ibd_copybuf_t;

typedef struct ibd_wqe_s {
	struct ibd_wqe_s	*w_next;
	ibd_copybuf_t		w_copybuf;
	mblk_t			*im_mblk;
} ibd_wqe_t;

/*
 * Send WQE
 */
typedef struct ibd_swqe_s {
	ibd_wqe_t		w_ibd_swqe;
	ibd_wqe_buftype_t	w_buftype;
	ibt_send_wr_t		w_swr;
	ibd_ace_t		*w_ahandle;
	ibt_mi_hdl_t		w_mi_hdl;
	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
	ibd_rc_tx_largebuf_t	*w_rc_tx_largebuf;
} ibd_swqe_t;

#define	swqe_next		w_ibd_swqe.w_next
#define	swqe_copybuf		w_ibd_swqe.w_copybuf
#define	swqe_im_mblk		w_ibd_swqe.im_mblk
#define	SWQE_TO_WQE(swqe)	(ibd_wqe_t *)&((swqe)->w_ibd_swqe)
#define	WQE_TO_SWQE(wqe)	(ibd_swqe_t *)wqe

/*
 * Receive WQE
 */
typedef struct ibd_rwqe_s {
	ibd_wqe_t		w_ibd_rwqe;
	struct ibd_state_s	*w_state;
	ibt_recv_wr_t		w_rwr;
	frtn_t			w_freemsg_cb;
	boolean_t		w_freeing_wqe;
	struct ibd_rc_chan_s	*w_chan;
} ibd_rwqe_t;

#define	rwqe_next		w_ibd_rwqe.w_next
#define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
#define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
#define	RWQE_TO_WQE(rwqe)	(ibd_wqe_t *)&((rwqe)->w_ibd_rwqe)
#define	WQE_TO_RWQE(wqe)	(ibd_rwqe_t *)wqe

typedef struct ibd_list_s {
	kmutex_t		dl_mutex;
	ibd_wqe_t		*dl_head;
	union {
		boolean_t	pending_sends;
		uint32_t	bufs_outstanding;
	} ustat;
	uint32_t		dl_cnt;
} ibd_list_t;

#define	dl_pending_sends	ustat.pending_sends
#define	dl_bufs_outstanding	ustat.bufs_outstanding

/*
 * LSO buffers
 *
 * Under normal circumstances we should never need to use any buffer
 * that's larger than the MTU. Unfortunately, the IB HCA has limitations
 * on the length of SGL that are much smaller than those for regular
 * ethernet NICs. Since the network layer doesn't care to limit the
 * number of mblk fragments in any send mp chain, we end up having to
 * use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
 * buffers occasionally.
 */
typedef struct ibd_lsobuf_s {
	struct ibd_lsobuf_s	*lb_next;
	uint8_t			*lb_buf;
	int			lb_isfree;
} ibd_lsobuf_t;

typedef struct ibd_lsobkt_s {
	uint8_t			*bkt_mem;
	ibd_lsobuf_t		*bkt_bufl;
	ibd_lsobuf_t		*bkt_free_head;
	ibt_mr_hdl_t		bkt_mr_hdl;
	ibt_mr_desc_t		bkt_mr_desc;
	uint_t			bkt_nelem;
	uint_t			bkt_nfree;
} ibd_lsobkt_t;

/*
 * Posting to a single software rx post queue is contentious,
 * so break it out into an array of (multiple) queues.
 *
 * Try to ensure rx_queue structs fall in different cache lines using a filler.
 * Note: RX_QUEUE_CACHE_LINE needs to change if the struct changes.
 */
#define	RX_QUEUE_CACHE_LINE \
	(64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t)))
typedef struct ibd_rx_queue_s {
	kmutex_t		rx_post_lock;
	ibd_wqe_t		*rx_head;
	uint_t			rx_cnt;
	uint8_t			rx_pad[RX_QUEUE_CACHE_LINE];
} ibd_rx_queue_t;

/*
 * This structure maintains information per port per HCA
 * (per network interface).
 */
typedef struct ibd_state_s {
	dev_info_t		*id_dip;
	ibt_clnt_hdl_t		id_ibt_hdl;
	ibt_hca_hdl_t		id_hca_hdl;
	ibt_pd_hdl_t		id_pd_hdl;
	kmem_cache_t		*id_req_kmc;

	ibd_list_t		id_tx_rel_list;

	uint32_t		id_running;

	uint32_t		id_max_sqseg;
	uint32_t		id_max_sqseg_hiwm;
	ibd_list_t		id_tx_list;
	ddi_softintr_t		id_tx;
	uint32_t		id_tx_sends;

	kmutex_t		id_txpost_lock;
	ibd_swqe_t		*id_tx_head;
	ibd_swqe_t		*id_tx_tail;
	int			id_tx_busy;

	uint_t			id_tx_buf_sz;
	uint8_t			*id_tx_bufs;
	ibd_swqe_t		*id_tx_wqes;
	ibt_mr_hdl_t		id_tx_mr_hdl;
	ibt_mr_desc_t		id_tx_mr_desc;

	kmutex_t		id_lso_lock;
	ibd_lsobkt_t		*id_lso;

	kmutex_t		id_scq_poll_lock;
	int			id_scq_poll_busy;

	ibt_cq_hdl_t		id_scq_hdl;
	ibt_wc_t		*id_txwcs;
	uint32_t		id_txwcs_size;

	int			id_rx_nqueues;
	ibd_rx_queue_t		*id_rx_queues;
	int			id_rx_post_queue_index;
	uint32_t		id_rx_post_active;

	ibd_rwqe_t		*id_rx_wqes;
	uint8_t			*id_rx_bufs;
	ibt_mr_hdl_t		id_rx_mr_hdl;
	ibt_mr_desc_t		id_rx_mr_desc;
	uint_t			id_rx_buf_sz;
	uint32_t		id_num_rwqe;
	ibd_list_t		id_rx_list;
	ddi_softintr_t		id_rx;
	uint32_t		id_rx_bufs_outstanding_limit;
	uint32_t		id_rx_allocb;
	uint32_t		id_rx_allocb_failed;
	ibd_list_t		id_rx_free_list;

	kmutex_t		id_rcq_poll_lock;
	int			id_rcq_poll_busy;
	uint32_t		id_rxwcs_size;
	ibt_wc_t		*id_rxwcs;
	ibt_cq_hdl_t		id_rcq_hdl;

	ibt_channel_hdl_t	id_chnl_hdl;
	ib_pkey_t		id_pkey;
	uint16_t		id_pkix;
	uint8_t			id_port;
	ibt_mcg_info_t		*id_mcinfo;

	mac_handle_t		id_mh;
	mac_resource_handle_t	id_rh;
	ib_gid_t		id_sgid;
	ib_qpn_t		id_qpnum;
	ipoib_mac_t		id_macaddr;
	ib_gid_t		id_mgid;
	ipoib_mac_t		id_bcaddr;

	int			id_mtu;
	uchar_t			id_scope;

	kmutex_t		id_acache_req_lock;
	kcondvar_t		id_acache_req_cv;
	struct list		id_req_list;
	kt_did_t		id_async_thrid;

	kmutex_t		id_ac_mutex;
	ibd_ace_t		*id_ac_hot_ace;
	struct list		id_ah_active;
	struct list		id_ah_free;
	ipoib_mac_t		id_ah_addr;
	ibd_req_t		id_ah_req;
	char			id_ah_op;
	uint64_t		id_ah_error;
	ibd_ace_t		*id_ac_list;
	mod_hash_t		*id_ah_active_hash;

	kmutex_t		id_mc_mutex;
	struct list		id_mc_full;
	struct list		id_mc_non;

	kmutex_t		id_trap_lock;
	kcondvar_t		id_trap_cv;
	boolean_t		id_trap_stop;
	uint32_t		id_trap_inprog;

	char			id_prom_op;

	kmutex_t		id_sched_lock;
	int			id_sched_needed;
	int			id_sched_cnt;
	int			id_sched_lso_cnt;

	kmutex_t		id_link_mutex;
	link_state_t		id_link_state;
	uint64_t		id_link_speed;

	uint64_t		id_num_intrs;
	uint64_t		id_tx_short;
	uint32_t		id_num_swqe;

	uint64_t		id_xmt_bytes;
	uint64_t		id_rcv_bytes;
	uint64_t		id_multi_xmt;
	uint64_t		id_brd_xmt;
	uint64_t		id_multi_rcv;
	uint64_t		id_brd_rcv;
	uint64_t		id_xmt_pkt;
	uint64_t		id_rcv_pkt;

	uint32_t		id_hwcksum_capab;
	boolean_t		id_lso_policy;
	boolean_t		id_lso_capable;
	uint_t			id_lso_maxlen;
	int			id_hca_res_lkey_capab;
	ibt_lkey_t		id_res_lkey;

	boolean_t		id_bgroup_created;
	kmutex_t		id_macst_lock;
	kcondvar_t		id_macst_cv;
	uint32_t		id_mac_state;

	/* For Reliable Connected Mode */
	boolean_t		id_enable_rc;
	boolean_t		rc_enable_srq;

	int			rc_mtu;
	uint32_t		rc_tx_max_sqseg;
	/*
	 * In IPoIB over Reliable Connected mode, the mac address has an
	 * "IBD_MAC_ADDR_RC" prefix added to it. But for the loopback filter
	 * in ibd_process_rx(), the input mac address should not include the
	 * "IBD_MAC_ADDR_RC" prefix.
	 *
	 * So, we introduce rc_macaddr_loopback for the loopback filter in
	 * IPoIB over Reliable Connected mode.
	 *
	 * rc_macaddr_loopback = id_macaddr without the "IBD_MAC_ADDR_RC"
	 * prefix.
	 */
	ipoib_mac_t		rc_macaddr_loopback;

	ibt_srv_hdl_t		rc_listen_hdl;
	ibt_sbind_hdl_t		rc_listen_bind;
	ibt_srv_hdl_t		rc_listen_hdl_OFED_interop;
	ibt_sbind_hdl_t		rc_listen_bind_OFED_interop;

	ibd_rc_chan_list_t	rc_pass_chan_list;
	/* obsolete active channel list */
	ibd_rc_chan_list_t	rc_obs_act_chan_list;

	kmutex_t		rc_ace_recycle_lock;
	ibd_ace_t		*rc_ace_recycle;

	/* Send */
	/*
	 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
	 * and ibd_rc_tx_largebuf_t->lb_next
	 */
	kmutex_t		rc_tx_large_bufs_lock;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_free_head;
	uint_t			rc_tx_largebuf_nfree;
	/* The chunk of whole Tx large buffers */
	uint8_t			*rc_tx_mr_bufs;
	ibt_mr_hdl_t		rc_tx_mr_hdl;
	ibt_mr_desc_t		rc_tx_mr_desc;
	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_desc_base;	/* base addr */

	boolean_t		rc_enable_iov_map;
	uint_t			rc_max_sqseg_hiwm;

	/* For SRQ */
	uint32_t		rc_srq_size;
	ibt_srq_hdl_t		rc_srq_hdl;
	ibd_list_t		rc_srq_rwqe_list;
	ibd_list_t		rc_srq_free_list;
	ibd_rwqe_t		*rc_srq_rwqes;
	uint8_t			*rc_srq_rx_bufs;
	ibt_mr_hdl_t		rc_srq_rx_mr_hdl;
	ibt_mr_desc_t		rc_srq_rx_mr_desc;

	/* For chained receive */
	kmutex_t		rc_rx_lock;
	mblk_t			*rc_rx_mp;
	mblk_t			*rc_rx_mp_tail;
	uint32_t		rc_rx_mp_len;

	/* Counters for RC mode */
	/* RX */
	/*
	 * # of received packets that are transferred directly to GLD
	 * without copying
	 */
	uint64_t		rc_rcv_trans_byte;
	uint64_t		rc_rcv_trans_pkt;
	/*
	 * # of received packets for which we allocate new buffers, copy
	 * their content into the new buffers, then pass them up to GLD
	 */
	uint64_t		rc_rcv_copy_byte;
	uint64_t		rc_rcv_copy_pkt;
	uint64_t		rc_rcv_alloc_fail;

#ifdef DEBUG
	uint64_t		rc_rwqe_short;	/* short rwqe */
#endif

	/* # of times the Receive CQ handler is invoked */
	uint64_t		rc_rcq_invoke;
	/* wc->wc_status != IBT_WC_SUCCESS */
	uint64_t		rc_rcq_err;

	/* Tx */
	uint64_t		rc_xmt_bytes;

	/* pkt size <= ibd_rc_tx_copy_thresh */
	uint64_t		rc_xmt_small_pkt;
	uint64_t		rc_xmt_fragmented_pkt;
	/* fail in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_fail_pkt;
	/* succ in ibt_map_mem_iov() */
	uint64_t		rc_xmt_map_succ_pkt;

	uint64_t		rc_ace_not_found;

	uint64_t		rc_xmt_drop_too_long_pkt;
	uint64_t		rc_xmt_icmp_too_long_pkt;
	uint64_t		rc_xmt_reenter_too_long_pkt;

	/* short swqe in ibd_send() */
	uint64_t		rc_swqe_short;
	/* call mac_tx_update() when there is enough swqe */
	uint64_t		rc_swqe_mac_update;
	/* short tx large copy buf in ibd_send() */
	uint64_t		rc_xmt_buf_short;
	/* call mac_tx_update() when there is enough Tx copy buf */
	uint64_t		rc_xmt_buf_mac_update;

	/* No swqe even after calling the swqe recycle function */
	uint64_t		rc_scq_no_swqe;
	/* No large Tx buf even after calling the swqe recycle function */
	uint64_t		rc_scq_no_largebuf;
	/* # of times the Send CQ handler is invoked */
	uint64_t		rc_scq_invoke;

	/* Connection setup and close */
	uint64_t		rc_conn_succ;	/* # of successful connects */
	uint64_t		rc_conn_fail;	/* # of failed connects */
	/* ace->ac_chan == NULL for unicast packet */
	uint64_t		rc_null_conn;
	/* not in active established state */
	uint64_t		rc_no_estab_conn;

	uint64_t		rc_act_close;	/* call ibd_rc_act_close() */
	uint64_t		rc_pas_close;	/* call ibd_rc_pas_close() */
	uint64_t		rc_delay_ace_recycle;
	uint64_t		rc_act_close_simultaneous;

	/* # of RC channel resets */
	uint64_t		rc_reset_cnt;

#ifdef DEBUG
	kstat_t			*rc_ksp;
#endif
} ibd_state_t;
/*
 * Structures to track global IBTF data, data that is shared
 * among the IBD device instances. This includes the one ibt_hdl
 * and the list of service registrations.
 */
typedef struct ibd_service_s {
	struct ibd_service_s	*is_link;
	ibt_srv_hdl_t		is_srv_hdl;
	ib_svc_id_t		is_sid;
	uint_t			is_ref_cnt;
} ibd_service_t;

typedef struct ibd_global_state_s {
	kmutex_t	ig_mutex;
	ibt_clnt_hdl_t	ig_ibt_hdl;
	uint_t		ig_ibt_hdl_ref_cnt;
	ibd_service_t	*ig_service_list;
} ibd_global_state_t;

typedef struct ibd_rc_msg_hello_s {
	uint32_t	reserved_qpn;
	uint32_t	rx_mtu;
} ibd_rc_msg_hello_t;

typedef struct ibd_rc_chan_s {
	struct ibd_rc_chan_s	*next;
	/* channel hdl that we'll be using for Reliable Connected Mode */
	ibt_channel_hdl_t	chan_hdl;
	struct ibd_state_s	*state;
	ibd_ace_t		*ace;
	ibd_rc_chan_state_t	chan_state;

	/* used to judge duplicate connection */
	ib_gid_t		requester_gid;
	ib_pkey_t		requester_pkey;

	ibd_list_t		tx_wqe_list;	/* free wqe list */
	ibd_list_t		tx_rel_list;	/* for swqe recycle */

	ibd_swqe_t		*tx_wqes;

	/* start address of Tx Buffers */
	uint8_t			*tx_mr_bufs;
	ibt_mr_hdl_t		tx_mr_hdl;
	ibt_mr_desc_t		tx_mr_desc;

	ibt_cq_hdl_t		scq_hdl;	/* Tx completion queue */
	ibt_wc_t		tx_wc[IBD_RC_MAX_CQ_WC];
	ddi_softintr_t		scq_softintr;

	uint32_t		tx_trans_error_cnt;

	/* For chained send */
	kmutex_t		tx_post_lock;
	ibd_swqe_t		*tx_head;
	ibd_swqe_t		*tx_tail;
	int			tx_busy;

	/* For tx buffer recycle */
	kmutex_t		tx_poll_lock;
	int			tx_poll_busy;

	/* Rx */
	ibd_list_t		rx_wqe_list;	/* used by ibt_post_recv */
	ibd_list_t		rx_free_list;	/* free rwqe list */

	ibt_cq_hdl_t		rcq_hdl;	/* Rx completion queue */
	ibt_wc_t		rx_wc[IBD_RC_MAX_CQ_WC];

	ibd_rwqe_t		*rx_rwqes;	/* the chunk of whole rwqes */
	uint8_t			*rx_bufs;	/* the chunk of whole Rx bufs */
	ibt_mr_hdl_t		rx_mr_hdl;	/* ibt_mr_hdl_t for rx_bufs */
	ibt_mr_desc_t		rx_mr_desc;	/* ibt_mr_desc_t for rx_bufs */

	/* For chained receive */
	kmutex_t		rx_lock;
	mblk_t			*rx_mp;
	mblk_t			*rx_mp_tail;
	uint32_t		rx_mp_len;

	uint32_t		rcq_size;
	uint32_t		scq_size;
	/*
	 * We need two channels for each connection.
	 * One channel for Tx; another channel for Rx.
	 * If "is_tx_chan == B_TRUE", this is a Tx channel.
	 */
	boolean_t		is_tx_chan;
} ibd_rc_chan_t;

/*
 * The following functions are defined in "ibd.c".
 * They are also used by "ibd_cm.c".
 */
void ibd_print_warn(ibd_state_t *, char *, ...);
void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
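/*
 * Illustrative sketch (an assumption, not driver code): handing work to the
 * async thread using the declarations above.  The request comes from the
 * id_req_kmc kmem cache in ibd_state_t:
 *
 *	ibd_req_t *req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	if (req != NULL) {
 *		req->rq_ptr = (void *)ace;
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
 *	}
 *
 * The use of rq_ptr and the IBD_ASYNC_RC_RECYCLE_ACE opcode here are
 * hypothetical; see ibd.c and ibd_cm.c for the real request handling.
 */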
/*
 * The following functions are defined in "ibd_cm.c".
 * They are also used in "ibd.c".
 */
void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *);
void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *);

/* Connection Setup/Close Functions */
ibt_status_t ibd_rc_listen(ibd_state_t *);
void ibd_rc_stop_listen(ibd_state_t *);
ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *,
    uint64_t);
void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *);
void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *);
void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *);
void ibd_rc_close_all_chan(ibd_state_t *);

/* Receive Functions */
int ibd_rc_init_srq_list(ibd_state_t *);
void ibd_rc_fini_srq_list(ibd_state_t *);
int ibd_rc_repost_srq_free_list(ibd_state_t *);

/* Send Functions */
int ibd_rc_init_tx_largebuf_list(ibd_state_t *);
void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *);
void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *);
void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t);
void ibd_rc_tx_cleanup(ibd_swqe_t *);

/* Others */
void ibd_rc_get_conf(ibd_state_t *);
int ibd_rc_init_stats(ibd_state_t *);

#endif /* _KERNEL && !_BOOT */

#ifdef __cplusplus
}
#endif

#endif	/* _SYS_IB_CLIENTS_IBD_H */