1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * An implementation of the IPoIB standard based on PSARC 2001/289. 
*/

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip_if.h>		/* for IP6_DL_SAP */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/pattr.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

/*
 * Modes of hardware/driver/software checksum, useful for debugging
 * and performance studies.
 *
 * none: h/w (Tavor) and driver does not do checksum, IP software must.
 * partial: driver does data checksum, IP must provide pseudo header.
 * perf_partial: driver uses IP provided pseudo cksum as data checksum
 *		 (thus, real checksumming is not done).
 */
typedef enum {
	IBD_CSUM_NONE,
	IBD_CSUM_PARTIAL,
	IBD_CSUM_PERF_PARTIAL
} ibd_csum_type_t;

typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;

/*
 * Per interface tunable parameters.
 */
uint_t ibd_rx_threshold = 16;
uint_t ibd_tx_current_copy_threshold = 0x10000000;
/* should be less than the max Tavor CQ size and be 2^n - 1 */
uint_t ibd_num_rwqe = 511;
uint_t ibd_num_swqe = 511;
uint_t ibd_num_ah = 16;
uint_t ibd_hash_size = 16;
/* 0 disables the Rx service fifos, see map_rx_srv_fifos() */
uint_t ibd_srv_fifos = 0x0;
/* 0 lets map_rx_srv_fifos() pick the per-fifo depth */
uint_t ibd_fifo_depth = 0;
ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;

/*
 * The driver can use separate CQs for send and receive queues.
 * While using separate CQs, it is possible to put the send CQ
 * in polling mode, ie not to enable notifications on that CQ.
 * If both CQs are interrupt driven, currently it is not possible
 * for their handlers to be invoked concurrently (since Tavor ties
 * both interrupts to the same PCI intr line); but the handlers
 * are not coded with a single interrupt cpu assumption (eg
 * id_num_intrs is incremented atomically).
 *
 * The driver private struct uses id_scq_hdl to track the separate
 * CQ being used for send; the id_rcq_hdl tracks the receive CQ
 * if using separate CQs, or it tracks the single CQ when using
 * combined CQ. The id_wcs completion array is used in the combined
 * CQ case, and for fetching Rx completions in the separate CQs case;
 * the id_txwcs is used to fetch Tx completions in the separate CQs
 * case.
 */
uint_t ibd_separate_cqs = 1;
uint_t ibd_txcomp_poll = 0;

/*
 * The softintr is introduced to avoid Event Queue overflow. It
 * should not have heavy load in CQ event handle function.
 * If service fifos is enabled, this is not required, because
 * mac_rx() will be called by service threads.
 */
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;

/*
 * Initial number of IBA resources allocated.
 */
#define	IBD_NUM_RWQE			ibd_num_rwqe
#define	IBD_NUM_SWQE			ibd_num_swqe
#define	IBD_NUM_AH			ibd_num_ah

/* when <= threshold, it's faster to copy to a premapped buffer */
#define	IBD_TX_COPY_THRESHOLD		ibd_tx_current_copy_threshold

/*
 * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
 * allocate a new WQE to put on the rxlist. This value must be <=
 * IBD_NUM_RWQE/id_num_rwqe.
 */
#define	IBD_RX_THRESHOLD		ibd_rx_threshold

/*
 * Hash table size for the active AH list.
 */
#define	IBD_HASH_SIZE			ibd_hash_size

#define	IBD_TXPOLL_THRESHOLD		64
/*
 * PAD routine called during send/recv context
 */
#define	IBD_SEND	0
#define	IBD_RECV	1

/*
 * fill / clear in <scope> and <p_key> in multicast/broadcast address.
 *
 * Both macros operate on the 32-bit words at byte offsets 4 and 8 of
 * maddr: FILL ORs the (network order) scope and pkey bits in, CLEAR
 * masks the same bit positions back out. The words are accessed through
 * uint32_t pointers.
 * NOTE(review): this presumably relies on maddr being 4-byte aligned --
 * confirm against the callers.
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)	\
	{	\
		*(uint32_t *)((char *)(maddr) + 4) |=	\
		    htonl((uint32_t)(scope) << 16);	\
		*(uint32_t *)((char *)(maddr) + 8) |=	\
		    htonl((uint32_t)(pkey) << 16);	\
	}

#define	IBD_CLEAR_SCOPE_PKEY(maddr)	\
	{	\
		*(uint32_t *)((char *)(maddr) + 4) &=	\
		    htonl(~((uint32_t)0xF << 16));	\
		*(uint32_t *)((char *)(maddr) + 8) &=	\
		    htonl(~((uint32_t)0xFFFF << 16));	\
	}

/*
 * when free tx wqes >= threshold and reschedule flag is set,
 * ibd will call mac_tx_update to re-enable Tx.
*/
#define	IBD_TX_UPDATE_THRESHOLD		1

/* Driver State Pointer (soft state anchor, set up in _init()) */
void *ibd_list;

/* Required system entry points */
static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* Required driver entry points for GLDv3 */
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_unicst(void *, const uint8_t *);
static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
static int ibd_m_promisc(void *, boolean_t);
static int ibd_m_stat(void *, uint_t, uint64_t *);
static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
static mblk_t *ibd_m_tx(void *, mblk_t *);

/* Private driver entry points for GLDv3 */
static boolean_t ibd_send(ibd_state_t *, mblk_t *);
static uint_t ibd_intr(char *);
static uint_t ibd_tx_recycle(char *);
static int ibd_state_init(ibd_state_t *, dev_info_t *);
static void ibd_state_fini(ibd_state_t *);
static int ibd_drv_init(ibd_state_t *);
static void ibd_drv_fini(ibd_state_t *);
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
    ibt_subnet_event_code_t, ibt_subnet_event_t *);
static int ibd_init_txlist(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_freemsg_cb(char *);
static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **);
static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    ibt_async_event_t *);
static int ibd_acache_init(ibd_state_t *);
static void ibd_acache_fini(ibd_state_t *);
static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
static void ibd_async_unsetprom(ibd_state_t *);
static void ibd_async_setprom(ibd_state_t *);
static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
static void ibd_async_txsched(ibd_state_t *);
static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
static void ibd_async_work(ibd_state_t *);
static void ibd_async_link(ibd_state_t *, ibd_req_t *);
static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t);
static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *,
    ipoib_mac_t *);
static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *);
static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
static uint64_t ibd_get_portspeed(ibd_state_t *);

#ifdef RUN_PERFORMANCE
static void ibd_perf(ibd_state_t *);
#endif

/* Device operations; attach/detach are the only driver-supplied hooks here */
DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);

/* Module Driver Info */
static struct modldrv ibd_modldrv = {
	&mod_driverops,			/* This one is a driver */
	"InfiniBand GLDv3 Driver",	/* short description */
	&ibd_dev_ops			/* driver specific ops */
};

/* Module Linkage */
static struct modlinkage ibd_modlinkage = {
	MODREV_1, (void *)&ibd_modldrv, NULL
};

/*
 * Module Info passed to IBTL during IBT_ATTACH.
 *	NOTE:  This data must be static (i.e.
IBTL just keeps a pointer to this 268 * data). 269 */ 270 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 271 IBTI_V_CURR, 272 IBT_NETWORK, 273 ibd_async_handler, 274 NULL, 275 "IPIB" 276 }; 277 278 /* 279 * Async operation types. 280 */ 281 #define ASYNC_GETAH 1 282 #define ASYNC_JOIN 2 283 #define ASYNC_LEAVE 3 284 #define ASYNC_PROMON 4 285 #define ASYNC_PROMOFF 5 286 #define ASYNC_REAP 6 287 #define ASYNC_TRAP 8 288 #define ASYNC_SCHED 9 289 #define ASYNC_LINK 10 290 #define ASYNC_EXIT 11 291 292 /* 293 * Async operation states 294 */ 295 #define NOTSTARTED 0 296 #define ONGOING 1 297 #define COMPLETED 2 298 #define ERRORED 3 299 #define ROUTERED 4 300 301 #define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF 302 303 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 304 static mac_callbacks_t ib_m_callbacks = { 305 IBD_M_CALLBACK_FLAGS, 306 ibd_m_stat, 307 ibd_m_start, 308 ibd_m_stop, 309 ibd_m_promisc, 310 ibd_m_multicst, 311 ibd_m_unicst, 312 ibd_m_tx, 313 NULL, 314 ibd_m_getcapab 315 }; 316 317 #ifdef DEBUG 318 319 static int rxpack = 1, txpack = 1; 320 int ibd_debuglevel = 100; 321 static void 322 debug_print(int l, char *fmt, ...) 323 { 324 va_list ap; 325 326 if (l < ibd_debuglevel) 327 return; 328 va_start(ap, fmt); 329 vcmn_err(CE_CONT, fmt, ap); 330 va_end(ap); 331 } 332 #define INCRXPACK (rxpack++) 333 #define INCTXPACK (txpack++) 334 #define DPRINT debug_print 335 336 #else /* DEBUG */ 337 338 #define INCRXPACK 0 339 #define INCTXPACK 0 340 #define DPRINT 341 342 #endif /* DEBUG */ 343 344 /* 345 * Common routine to print warning messages; adds in hca guid, port number 346 * and pkey to be able to identify the IBA interface. 347 */ 348 static void 349 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 
350 { 351 ib_guid_t hca_guid; 352 char ibd_print_buf[256]; 353 int len; 354 va_list ap; 355 356 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 357 0, "hca-guid", 0); 358 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 359 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 360 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 361 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 362 va_start(ap, fmt); 363 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 364 fmt, ap); 365 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 366 va_end(ap); 367 } 368 369 /* warlock directives */ 370 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 371 ibd_state_t::id_ah_active)) 372 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free)) 373 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 374 ibd_state_t::id_req_list)) 375 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 376 ibd_state_t::id_acache_req_cv)) 377 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 378 ibd_state_t::id_mc_full)) 379 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 380 ibd_state_t::id_mc_non)) 381 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 382 ibd_state_t::id_link_state)) 383 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 384 ibd_state_s::id_tx_list)) 385 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 386 ibd_state_s::id_rx_list)) 387 388 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error)) 389 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op)) 390 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs)) 391 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op)) 392 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short)) 393 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list)) 394 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", 
ibd_state_s::id_tx_list)) 395 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op)) 396 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid)) 397 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr)) 398 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce)) 399 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref)) 400 401 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s)) 402 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s)) 403 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s)) 404 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac)) 405 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh)) 406 407 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s)) 408 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req)) 409 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap)) 410 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate)) 411 412 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr)) 413 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr)) 414 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id)) 415 416 #ifdef DEBUG 417 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack)) 418 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack)) 419 #endif 420 421 int 422 _init() 423 { 424 int status; 425 426 /* 427 * Sanity check some parameter settings. Tx completion polling 428 * only makes sense with separate CQs for Tx and Rx. 
429 */ 430 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 431 cmn_err(CE_NOTE, "!ibd: %s", 432 "Setting ibd_txcomp_poll = 0 for combined CQ"); 433 ibd_txcomp_poll = 0; 434 } 435 436 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 437 if (status != 0) { 438 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 439 return (status); 440 } 441 442 mac_init_ops(&ibd_dev_ops, "ibd"); 443 status = mod_install(&ibd_modlinkage); 444 if (status != 0) { 445 DPRINT(10, "_init:failed in mod_install()"); 446 ddi_soft_state_fini(&ibd_list); 447 mac_fini_ops(&ibd_dev_ops); 448 return (status); 449 } 450 451 return (0); 452 } 453 454 int 455 _info(struct modinfo *modinfop) 456 { 457 return (mod_info(&ibd_modlinkage, modinfop)); 458 } 459 460 int 461 _fini() 462 { 463 int status; 464 465 status = mod_remove(&ibd_modlinkage); 466 if (status != 0) 467 return (status); 468 469 mac_fini_ops(&ibd_dev_ops); 470 ddi_soft_state_fini(&ibd_list); 471 return (0); 472 } 473 474 /* 475 * Convert the GID part of the mac address from network byte order 476 * to host order. 477 */ 478 static void 479 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 480 { 481 ib_sn_prefix_t nbopref; 482 ib_guid_t nboguid; 483 484 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 485 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 486 dgid->gid_prefix = b2h64(nbopref); 487 dgid->gid_guid = b2h64(nboguid); 488 } 489 490 /* 491 * Create the IPoIB address in network byte order from host order inputs. 
492 */ 493 static void 494 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 495 ib_guid_t guid) 496 { 497 ib_sn_prefix_t nbopref; 498 ib_guid_t nboguid; 499 500 mac->ipoib_qpn = htonl(qpn); 501 nbopref = h2b64(prefix); 502 nboguid = h2b64(guid); 503 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 504 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 505 } 506 507 /* 508 * Send to the appropriate all-routers group when the IBA multicast group 509 * does not exist, based on whether the target group is v4 or v6. 510 */ 511 static boolean_t 512 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 513 ipoib_mac_t *rmac) 514 { 515 boolean_t retval = B_TRUE; 516 uint32_t adjscope = state->id_scope << 16; 517 uint32_t topword; 518 519 /* 520 * Copy the first 4 bytes in without assuming any alignment of 521 * input mac address; this will have IPoIB signature, flags and 522 * scope bits. 523 */ 524 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 525 topword = ntohl(topword); 526 527 /* 528 * Generate proper address for IPv4/v6, adding in the Pkey properly. 529 */ 530 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 531 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 532 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 533 ((uint32_t)(state->id_pkey << 16))), 534 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 535 else 536 /* 537 * Does not have proper bits in the mgid address. 538 */ 539 retval = B_FALSE; 540 541 return (retval); 542 } 543 544 /* 545 * Implementation of various (software) flavors of send and receive side 546 * checksumming. 547 */ 548 #define IBD_CKSUM_SEND(mp) { \ 549 uint32_t start, stuff, end, value, flags; \ 550 uint32_t cksum, sum; \ 551 uchar_t *dp, *buf; \ 552 uint16_t *up; \ 553 \ 554 if (ibd_csum_send == IBD_CSUM_NONE) \ 555 goto punt_send; \ 556 \ 557 /* \ 558 * Query IP whether Tx cksum needs to be done. 
\ 559 */ \ 560 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, \ 561 &value, &flags); \ 562 \ 563 if (flags == HCK_PARTIALCKSUM) { \ 564 dp = ((uchar_t *)mp->b_rptr + IPOIB_HDRSIZE); \ 565 up = (uint16_t *)(dp + stuff); \ 566 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 567 end = ((uchar_t *)mp->b_wptr - dp - start); \ 568 cksum = *up; \ 569 *up = 0; \ 570 /* \ 571 * Does NOT handle chained mblks/more than one \ 572 * SGL. Applicable only for a single SGL \ 573 * entry/mblk, where the stuff offset is \ 574 * within the range of buf. \ 575 */ \ 576 buf = (dp + start); \ 577 sum = IP_BCSUM_PARTIAL(buf, end, cksum); \ 578 } else { \ 579 sum = *up; \ 580 } \ 581 DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n", \ 582 start, stuff, end, sum, cksum); \ 583 sum = ~(sum); \ 584 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 585 } \ 586 punt_send: \ 587 ; \ 588 } 589 590 #define IBD_CKSUM_RECV(mp) { \ 591 uchar_t *dp, *buf; \ 592 uint32_t start, end, value, stuff, flags; \ 593 uint16_t *up, frag; \ 594 ipha_t *iphp; \ 595 ipoib_hdr_t *ipibh; \ 596 \ 597 if (ibd_csum_recv == IBD_CSUM_NONE) \ 598 goto punt_recv; \ 599 \ 600 ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\ 601 if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP) \ 602 goto punt_recv; \ 603 \ 604 dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE); \ 605 iphp = (ipha_t *)dp; \ 606 frag = ntohs(iphp->ipha_fragment_offset_and_flags); \ 607 if ((frag) & (~IPH_DF)) \ 608 goto punt_recv; \ 609 start = IPH_HDR_LENGTH(iphp); \ 610 if (iphp->ipha_protocol == IPPROTO_TCP) \ 611 stuff = start + 16; \ 612 else if (iphp->ipha_protocol == IPPROTO_UDP) \ 613 stuff = start + 6; \ 614 else \ 615 goto punt_recv; \ 616 \ 617 flags = HCK_PARTIALCKSUM; \ 618 end = ntohs(iphp->ipha_length); \ 619 up = (uint16_t *)(dp + stuff); \ 620 \ 621 if (ibd_csum_recv == IBD_CSUM_PARTIAL) { \ 622 buf = (dp + start); \ 623 value = IP_BCSUM_PARTIAL(buf, end - start, 0); \ 624 } else { \ 625 value = (*up); \ 626 } \ 627 if (hcksum_assoc(mp, 
NULL, NULL, start, stuff, end, \ 628 value, flags, 0) != 0) \ 629 DPRINT(10, "cksum_recv: value: %x\n", value); \ 630 punt_recv: \ 631 ; \ 632 } 633 634 /* 635 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 636 * front of optional src/tgt link layer address. Right now Solaris inserts 637 * padding by default at the end. The routine which is doing is nce_xmit() 638 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when 639 * the packet comes down from IP layer to the IBD driver, it is in the 640 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 641 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 642 * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 643 * 644 * The send routine at IBD driver changes this packet as follows: 645 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 646 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 647 * aligned. 648 * 649 * At the receiving side again ibd_process_rx takes the above packet and 650 * removes the two bytes of front padding and inserts it at the end. This 651 * is since the IP layer does not understand padding at the front. 
652 */ 653 #define IBD_PAD_NSNA(ip6h, len, type) { \ 654 uchar_t *nd_lla_ptr; \ 655 icmp6_t *icmp6; \ 656 nd_opt_hdr_t *opt; \ 657 int i; \ 658 \ 659 icmp6 = (icmp6_t *)&ip6h[1]; \ 660 len -= sizeof (nd_neighbor_advert_t); \ 661 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 662 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 663 (len != 0)) { \ 664 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 665 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 666 ASSERT(opt != NULL); \ 667 nd_lla_ptr = (uchar_t *)&opt[1]; \ 668 if (type == 0) { \ 669 for (i = IPOIB_ADDRL; i > 0; i--) \ 670 *(nd_lla_ptr + i + 1) = \ 671 *(nd_lla_ptr + i - 1); \ 672 } else { \ 673 for (i = 0; i < IPOIB_ADDRL; i++) \ 674 *(nd_lla_ptr + i) = \ 675 *(nd_lla_ptr + i + 2); \ 676 } \ 677 *(nd_lla_ptr + i) = 0; \ 678 *(nd_lla_ptr + i + 1) = 0; \ 679 } \ 680 } 681 682 /* 683 * The service fifo code is copied verbatim from Cassini. This can be 684 * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu. 685 */ 686 687 typedef caddr_t fifo_obj_t, *p_fifo_obj_t; 688 689 typedef struct _srv_fifo_t { 690 kmutex_t fifo_lock; 691 kcondvar_t fifo_cv; 692 size_t size; 693 uint_t max_index; 694 uint_t rd_index; 695 uint_t wr_index; 696 uint_t objs_pending; 697 p_fifo_obj_t fifo_objs; 698 kthread_t *fifo_thread; 699 void (*drain_func)(caddr_t drain_func_arg); 700 caddr_t drain_func_arg; 701 boolean_t running; 702 callb_cpr_t cprinfo; 703 } srv_fifo_t, *p_srv_fifo_t; 704 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv)) 705 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo)) 706 707 static int 708 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size, 709 void (*drain_func)(), caddr_t drain_func_arg) 710 { 711 int status; 712 p_srv_fifo_t srv_fifo; 713 714 status = DDI_SUCCESS; 715 srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP); 716 srv_fifo->size = size; 717 srv_fifo->max_index = size - 1; 718 srv_fifo->fifo_objs = 
(p_fifo_obj_t)kmem_zalloc( 719 size * sizeof (fifo_obj_t), KM_SLEEP); 720 mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL); 721 cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL); 722 srv_fifo->drain_func = drain_func; 723 srv_fifo->drain_func_arg = drain_func_arg; 724 srv_fifo->running = DDI_SUCCESS; 725 srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func, 726 (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60); 727 if (srv_fifo->fifo_thread == NULL) { 728 cv_destroy(&srv_fifo->fifo_cv); 729 mutex_destroy(&srv_fifo->fifo_lock); 730 kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t)); 731 kmem_free(srv_fifo, sizeof (srv_fifo_t)); 732 srv_fifo = NULL; 733 status = DDI_FAILURE; 734 } else 735 *handle = srv_fifo; 736 return (status); 737 } 738 739 static void 740 _ddi_srv_fifo_destroy(p_srv_fifo_t handle) 741 { 742 kt_did_t tid = handle->fifo_thread->t_did; 743 744 mutex_enter(&handle->fifo_lock); 745 handle->running = DDI_FAILURE; 746 cv_signal(&handle->fifo_cv); 747 while (handle->running == DDI_FAILURE) 748 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 749 mutex_exit(&handle->fifo_lock); 750 if (handle->objs_pending != 0) 751 cmn_err(CE_NOTE, "!Thread Exit with work undone."); 752 cv_destroy(&handle->fifo_cv); 753 mutex_destroy(&handle->fifo_lock); 754 kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t)); 755 kmem_free(handle, sizeof (srv_fifo_t)); 756 thread_join(tid); 757 } 758 759 static caddr_t 760 _ddi_srv_fifo_begin(p_srv_fifo_t handle) 761 { 762 #ifndef __lock_lint 763 CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock, 764 callb_generic_cpr, "srv_fifo"); 765 #endif /* ! 
_lock_lint */ 766 return (handle->drain_func_arg); 767 } 768 769 static void 770 _ddi_srv_fifo_end(p_srv_fifo_t handle) 771 { 772 callb_cpr_t cprinfo; 773 774 mutex_enter(&handle->fifo_lock); 775 cprinfo = handle->cprinfo; 776 handle->running = DDI_SUCCESS; 777 cv_signal(&handle->fifo_cv); 778 #ifndef __lock_lint 779 CALLB_CPR_EXIT(&cprinfo); 780 #endif /* ! _lock_lint */ 781 thread_exit(); 782 _NOTE(NOT_REACHED) 783 } 784 785 static int 786 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal) 787 { 788 int status; 789 790 mutex_enter(&handle->fifo_lock); 791 status = handle->running; 792 if (status == DDI_SUCCESS) { 793 if (ptr) { 794 if (handle->objs_pending < handle->size) { 795 if (handle->wr_index == handle->max_index) 796 handle->wr_index = 0; 797 else 798 handle->wr_index++; 799 handle->fifo_objs[handle->wr_index] = ptr; 800 handle->objs_pending++; 801 } else 802 status = DDI_FAILURE; 803 if (signal) 804 cv_signal(&handle->fifo_cv); 805 } else { 806 if (signal && (handle->objs_pending > 0)) 807 cv_signal(&handle->fifo_cv); 808 } 809 } 810 mutex_exit(&handle->fifo_lock); 811 return (status); 812 } 813 814 static int 815 _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr) 816 { 817 int status; 818 819 mutex_enter(&handle->fifo_lock); 820 status = handle->running; 821 if (status == DDI_SUCCESS) { 822 if (handle->objs_pending == 0) { 823 #ifndef __lock_lint 824 CALLB_CPR_SAFE_BEGIN(&handle->cprinfo); 825 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 826 CALLB_CPR_SAFE_END(&handle->cprinfo, 827 &handle->fifo_lock); 828 #endif /* !_lock_lint */ 829 *ptr = NULL; 830 } 831 if (handle->objs_pending > 0) { 832 if (handle->rd_index == handle->max_index) 833 handle->rd_index = 0; 834 else 835 handle->rd_index++; 836 *ptr = handle->fifo_objs[handle->rd_index]; 837 handle->objs_pending--; 838 } 839 status = handle->running; 840 } else { 841 if (handle->objs_pending) { 842 if (handle->rd_index == handle->max_index) 843 handle->rd_index = 0; 844 else 845 
handle->rd_index++; 846 *ptr = handle->fifo_objs[handle->rd_index]; 847 handle->objs_pending--; 848 status = DDI_SUCCESS; 849 } else 850 status = DDI_FAILURE; 851 } 852 mutex_exit(&handle->fifo_lock); 853 return (status); 854 } 855 856 /* 857 * [un]map_rx_srv_fifos has been modified from its CE version. 858 */ 859 static void 860 drain_fifo(p_srv_fifo_t handle) 861 { 862 ibd_state_t *state; 863 mblk_t *mp; 864 865 state = (ibd_state_t *)_ddi_srv_fifo_begin(handle); 866 while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) { 867 /* 868 * Hand off to GLDv3. 869 */ 870 IBD_CKSUM_RECV(mp); 871 mac_rx(state->id_mh, NULL, mp); 872 } 873 _ddi_srv_fifo_end(handle); 874 } 875 876 static p_srv_fifo_t * 877 map_rx_srv_fifos(int *nfifos, void *private) 878 { 879 p_srv_fifo_t *srv_fifos; 880 int i, inst_taskqs, depth; 881 882 /* 883 * Default behavior on both sparc and amd cpus in terms of 884 * of worker thread is as follows: (N) indicates worker thread 885 * not enabled , (Y) indicates worker thread enabled. Default of 886 * ibd_srv_fifo is set to 0xffff. The default behavior can be 887 * overridden by setting ibd_srv_fifos to 0 or 1 as shown below. 888 * Worker thread model assigns lower priority to network 889 * processing making system more usable at higher network 890 * loads. 
891 * ________________________________________________________ 892 * |Value of ibd_srv_fifo | 0 | 1 | 0xffff| 0 | 1 | 0xfffff | 893 * |----------------------|---|---|-------|---|---|---------| 894 * | | Sparc | x86 | 895 * |----------------------|---|---|-------|---|---|---------| 896 * | Single CPU |N | Y | N | N | Y | N | 897 * |----------------------|---|---|-------|---|---|---------| 898 * | Multi CPU |N | Y | Y | N | Y | Y | 899 * |______________________|___|___|_______|___|___|_________| 900 */ 901 if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) || 902 (ibd_srv_fifos == 0)) { 903 *nfifos = 0; 904 return ((p_srv_fifo_t *)1); 905 } 906 907 *nfifos = inst_taskqs; 908 srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t), 909 KM_SLEEP); 910 911 /* 912 * If the administrator has specified a fifo depth, use 913 * that, else just decide what should be the depth. 914 */ 915 if (ibd_fifo_depth == 0) 916 depth = (IBD_NUM_RWQE / inst_taskqs) + 16; 917 else 918 depth = ibd_fifo_depth; 919 920 for (i = 0; i < inst_taskqs; i++) 921 if (_ddi_srv_fifo_create(&srv_fifos[i], 922 depth, drain_fifo, 923 (caddr_t)private) != DDI_SUCCESS) 924 break; 925 926 if (i < inst_taskqs) 927 goto map_rx_srv_fifos_fail1; 928 929 goto map_rx_srv_fifos_exit; 930 931 map_rx_srv_fifos_fail1: 932 i--; 933 for (; i >= 0; i--) { 934 _ddi_srv_fifo_destroy(srv_fifos[i]); 935 } 936 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 937 srv_fifos = NULL; 938 939 map_rx_srv_fifos_exit: 940 return (srv_fifos); 941 } 942 943 static void 944 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos) 945 { 946 int i; 947 948 /* 949 * If this interface was not using service fifos, quickly return. 
950 */ 951 if (inst_taskqs == 0) 952 return; 953 954 for (i = 0; i < inst_taskqs; i++) { 955 _ddi_srv_fifo_destroy(srv_fifos[i]); 956 } 957 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 958 } 959 960 /* 961 * Choose between sending up the packet directly and handing off 962 * to a service thread. 963 */ 964 static void 965 ibd_send_up(ibd_state_t *state, mblk_t *mp) 966 { 967 p_srv_fifo_t *srvfifo; 968 ipoib_hdr_t *lhdr; 969 struct ip *ip_hdr; 970 struct udphdr *tran_hdr; 971 uchar_t prot; 972 int tnum = -1, nfifos = state->id_nfifos; 973 974 /* 975 * Quick path if the interface is not using service fifos. 976 */ 977 if (nfifos == 0) { 978 hand_off: 979 IBD_CKSUM_RECV(mp); 980 mac_rx(state->id_mh, NULL, mp); 981 return; 982 } 983 984 /* 985 * Is the packet big enough to look at the IPoIB header 986 * and basic IP header to determine whether it is an 987 * IPv4 packet? 988 */ 989 if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE + 990 sizeof (struct ip))) { 991 992 lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE); 993 994 /* 995 * Is the packet an IP(v4) packet? 996 */ 997 if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) { 998 999 ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE + 1000 IPOIB_HDRSIZE); 1001 prot = ip_hdr->ip_p; 1002 1003 /* 1004 * TCP or UDP packet? We use the UDP header, since 1005 * the first few words of both headers are laid out 1006 * similarly (src/dest ports). 1007 */ 1008 if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) { 1009 1010 tran_hdr = (struct udphdr *)( 1011 (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2)); 1012 1013 /* 1014 * Are we within limits of this packet? If 1015 * so, use the destination port to hash to 1016 * a service thread. 
1017 */ 1018 if (mp->b_wptr >= ((uchar_t *)tran_hdr + 1019 sizeof (*tran_hdr))) 1020 tnum = (ntohs(tran_hdr->uh_dport) + 1021 ntohs(tran_hdr->uh_sport)) % 1022 nfifos; 1023 } 1024 } 1025 } 1026 1027 /* 1028 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the 1029 * packet up in interrupt context, reducing latency. 1030 */ 1031 if (tnum == -1) { 1032 goto hand_off; 1033 } 1034 1035 srvfifo = (p_srv_fifo_t *)state->id_fifos; 1036 if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp, 1037 B_TRUE) != DDI_SUCCESS) 1038 freemsg(mp); 1039 } 1040 1041 /* 1042 * Address handle entries maintained by the driver are kept in the 1043 * free and active lists. Each entry starts out in the free list; 1044 * it migrates to the active list when primed using ibt_get_paths() 1045 * and ibt_modify_ud_dest() for transmission to a specific destination. 1046 * In the active list, the entry has a reference count indicating the 1047 * number of ongoing/uncompleted transmits that reference it. The 1048 * entry is left in the active list even after the reference count 1049 * goes to 0, since successive transmits can find it there and do 1050 * not need to set up another entry (ie the path information is 1051 * cached using the active list). Entries on the active list are 1052 * also hashed using the destination link address as a key for faster 1053 * lookups during transmits. 1054 * 1055 * For any destination address (unicast or multicast, whatever the 1056 * join states), there will be at most one entry in the active list. 1057 * Entries with a 0 reference count on the active list can be reused 1058 * for a transmit to a new destination, if the free list is empty. 1059 * 1060 * The AH free list insertion/deletion is protected with the id_ac_mutex, 1061 * since the async thread and Tx callback handlers insert/delete. 
The 1062 * active list does not need a lock (all operations are done by the 1063 * async thread) but updates to the reference count are atomically 1064 * done (increments done by Tx path, decrements by the Tx callback handler). 1065 */ 1066 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 1067 list_insert_head(&state->id_ah_free, ce) 1068 #define IBD_ACACHE_GET_FREE(state) \ 1069 list_get_head(&state->id_ah_free) 1070 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 1071 int _ret_; \ 1072 list_insert_head(&state->id_ah_active, ce); \ 1073 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 1074 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1075 ASSERT(_ret_ == 0); \ 1076 } 1077 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 1078 list_remove(&state->id_ah_active, ce); \ 1079 (void) mod_hash_remove(state->id_ah_active_hash, \ 1080 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1081 } 1082 #define IBD_ACACHE_GET_ACTIVE(state) \ 1083 list_get_head(&state->id_ah_active) 1084 1085 /* 1086 * Membership states for different mcg's are tracked by two lists: 1087 * the "non" list is used for promiscuous mode, when all mcg traffic 1088 * needs to be inspected. This type of membership is never used for 1089 * transmission, so there can not be an AH in the active list 1090 * corresponding to a member in this list. This list does not need 1091 * any protection, since all operations are performed by the async 1092 * thread. 1093 * 1094 * "Full" and "SendOnly" membership is tracked using a single list, 1095 * the "full" list. This is because this single list can then be 1096 * searched during transmit to a multicast group (if an AH for the 1097 * mcg is not found in the active list), since at least one type 1098 * of membership must be present before initiating the transmit. 1099 * This list is also emptied during driver detach, since sendonly 1100 * membership acquired during transmit is dropped at detach time 1101 * alongwith ipv4 broadcast full membership. 
Insert/deletes to 1102 * this list are done only by the async thread, but it is also 1103 * searched in program context (see multicast disable case), thus 1104 * the id_mc_mutex protects the list. The driver detach path also 1105 * deconstructs the "full" list, but it ensures that the async 1106 * thread will not be accessing the list (by blocking out mcg 1107 * trap handling and making sure no more Tx reaping will happen). 1108 * 1109 * Currently, an IBA attach is done in the SendOnly case too, 1110 * although this is not required. 1111 */ 1112 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1113 list_insert_head(&state->id_mc_full, mce) 1114 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1115 list_insert_head(&state->id_mc_non, mce) 1116 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1117 ibd_mcache_find(mgid, &state->id_mc_full) 1118 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1119 ibd_mcache_find(mgid, &state->id_mc_non) 1120 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1121 list_remove(&state->id_mc_full, mce) 1122 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1123 list_remove(&state->id_mc_non, mce) 1124 1125 /* 1126 * AH and MCE active list manipulation: 1127 * 1128 * Multicast disable requests and MCG delete traps are two cases 1129 * where the active AH entry for the mcg (if any unreferenced one exists) 1130 * will be moved to the free list (to force the next Tx to the mcg to 1131 * join the MCG in SendOnly mode). Port up handling will also move AHs 1132 * from active to free list. 1133 * 1134 * In the case when some transmits are still pending on an entry 1135 * for an mcg, but a multicast disable has already been issued on the 1136 * mcg, there are some options to consider to preserve the join state 1137 * to ensure the emitted packet is properly routed on the IBA fabric. 1138 * For the AH, we can 1139 * 1. take out of active list at multicast disable time. 1140 * 2. take out of active list only when last pending Tx completes. 1141 * For the MCE, we can 1142 * 3. 
take out of active list at multicast disable time.
 * 4. take out of active list only when last pending Tx completes.
 * 5. move from active list to stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
 * a stale list (#5) that would be checked in the enable code would need
 * to be implemented. Option 2 is used, because otherwise, a Tx attempt
 * after the multicast disable would try to put an AH in the active list,
 * and associate the mce it finds in the active list to this new AH,
 * whereas the mce is already associated with the previous AH (taken off
 * the active list), and will be removed once the pending Tx's complete
 * (unless a reference count on mce's is implemented). One implication of
 * using 2,4 is that new Tx's posted before the pending Tx's complete will
 * grab new references on the AH, further delaying the leave.
 *
 * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE has to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes thru
 * reincarnations).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it can not possibly receive a mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required.
A safe 1201 * approach is to always LEAVE, but the SM may be confused if it 1202 * receives a LEAVE without a prior JOIN. 1203 * 1204 * Management of the non-membership to an mcg is similar to the above, 1205 * except that if the interface is in promiscuous mode, it is required 1206 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1207 * if the re-join attempt fails (in which case a warning message needs 1208 * to be printed), it is not clear whether it failed due to the mcg not 1209 * existing, or some fabric/hca issues, due to the delayed nature of 1210 * trap delivery. Querying the SA to establish presence/absence of the 1211 * mcg is also racy at best. Thus, the driver just prints a warning 1212 * message when it can not rejoin after receiving a create trap, although 1213 * this might be (on rare occassions) a mis-warning if the create trap is 1214 * received after the mcg was deleted. 1215 */ 1216 1217 /* 1218 * Implementation of atomic "recycle" bits and reference count 1219 * on address handles. This utilizes the fact that max reference 1220 * count on any handle is limited by number of send wqes, thus 1221 * high bits in the ac_ref field can be used as the recycle bits, 1222 * and only the low bits hold the number of pending Tx requests. 1223 * This atomic AH reference counting allows the Tx completion 1224 * handler not to acquire the id_ac_mutex to process every completion, 1225 * thus reducing lock contention problems between completion and 1226 * the Tx path. 1227 */ 1228 #define CYCLEVAL 0x80000 1229 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1230 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1231 #define GET_REF(ace) ((ace)->ac_ref) 1232 #define GET_REF_CYCLE(ace) ( \ 1233 /* \ 1234 * Make sure "cycle" bit is set. 
\ 1235 */ \ 1236 ASSERT(CYCLE_SET(ace)), \ 1237 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1238 ) 1239 #define INC_REF(ace, num) { \ 1240 atomic_add_32(&(ace)->ac_ref, num); \ 1241 } 1242 #define SET_CYCLE_IF_REF(ace) ( \ 1243 CYCLE_SET(ace) ? B_TRUE : \ 1244 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1245 CYCLEVAL ? \ 1246 /* \ 1247 * Clear the "cycle" bit we just set; \ 1248 * ref count known to be 0 from above. \ 1249 */ \ 1250 CLEAR_REFCYCLE(ace), B_FALSE : \ 1251 /* \ 1252 * We set "cycle" bit; let caller know. \ 1253 */ \ 1254 B_TRUE \ 1255 ) 1256 #define DEC_REF_DO_CYCLE(ace) ( \ 1257 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1258 CYCLEVAL ? \ 1259 /* \ 1260 * Ref count known to be 0 from above. \ 1261 */ \ 1262 B_TRUE : \ 1263 B_FALSE \ 1264 ) 1265 1266 static void * 1267 list_get_head(list_t *list) 1268 { 1269 list_node_t *lhead = list_head(list); 1270 1271 if (lhead != NULL) 1272 list_remove(list, lhead); 1273 return (lhead); 1274 } 1275 1276 /* 1277 * This is always guaranteed to be able to queue the work. 1278 */ 1279 static void 1280 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1281 { 1282 /* Initialize request */ 1283 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1284 ptr->rq_op = op; 1285 1286 /* 1287 * Queue provided slot onto request pool. 1288 */ 1289 mutex_enter(&state->id_acache_req_lock); 1290 list_insert_tail(&state->id_req_list, ptr); 1291 1292 /* Go, fetch, async thread */ 1293 cv_signal(&state->id_acache_req_cv); 1294 mutex_exit(&state->id_acache_req_lock); 1295 } 1296 1297 /* 1298 * Main body of the per interface async thread. 
1299 */ 1300 static void 1301 ibd_async_work(ibd_state_t *state) 1302 { 1303 ibd_req_t *ptr; 1304 callb_cpr_t cprinfo; 1305 1306 mutex_enter(&state->id_acache_req_lock); 1307 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1308 callb_generic_cpr, "ibd_async_work"); 1309 for (;;) { 1310 ptr = list_get_head(&state->id_req_list); 1311 if (ptr != NULL) { 1312 mutex_exit(&state->id_acache_req_lock); 1313 1314 /* 1315 * Once we have done the operation, there is no 1316 * guarantee the request slot is going to be valid, 1317 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP). 1318 */ 1319 1320 /* Perform the request */ 1321 switch (ptr->rq_op) { 1322 case ASYNC_GETAH: 1323 ibd_async_acache(state, &ptr->rq_mac); 1324 break; 1325 case ASYNC_REAP: 1326 ibd_async_reap_group(state, 1327 ptr->rq_ptr, ptr->rq_gid, 1328 IB_MC_JSTATE_FULL); 1329 /* 1330 * the req buf contains in mce 1331 * structure, so we do not need 1332 * to free it here. 1333 */ 1334 ptr = NULL; 1335 break; 1336 case ASYNC_LEAVE: 1337 case ASYNC_JOIN: 1338 ibd_async_multicast(state, 1339 ptr->rq_gid, ptr->rq_op); 1340 break; 1341 case ASYNC_PROMON: 1342 ibd_async_setprom(state); 1343 break; 1344 case ASYNC_PROMOFF: 1345 ibd_async_unsetprom(state); 1346 break; 1347 case ASYNC_TRAP: 1348 ibd_async_trap(state, ptr); 1349 break; 1350 case ASYNC_SCHED: 1351 ibd_async_txsched(state); 1352 break; 1353 case ASYNC_LINK: 1354 ibd_async_link(state, ptr); 1355 break; 1356 case ASYNC_EXIT: 1357 mutex_enter(&state->id_acache_req_lock); 1358 #ifndef __lock_lint 1359 CALLB_CPR_EXIT(&cprinfo); 1360 #endif /* !__lock_lint */ 1361 return; 1362 } 1363 if (ptr != NULL) 1364 kmem_cache_free(state->id_req_kmc, ptr); 1365 1366 mutex_enter(&state->id_acache_req_lock); 1367 } else { 1368 /* 1369 * Nothing to do: wait till new request arrives. 
1370 */ 1371 #ifndef __lock_lint 1372 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1373 cv_wait(&state->id_acache_req_cv, 1374 &state->id_acache_req_lock); 1375 CALLB_CPR_SAFE_END(&cprinfo, 1376 &state->id_acache_req_lock); 1377 #endif /* !_lock_lint */ 1378 } 1379 } 1380 /*NOTREACHED*/ 1381 _NOTE(NOT_REACHED) 1382 } 1383 1384 /* 1385 * Return when it is safe to queue requests to the async daemon; primarily 1386 * for subnet trap and async event handling. Disallow requests before the 1387 * daemon is created, and when interface deinitilization starts. 1388 */ 1389 static boolean_t 1390 ibd_async_safe(ibd_state_t *state) 1391 { 1392 mutex_enter(&state->id_trap_lock); 1393 if (state->id_trap_stop) { 1394 mutex_exit(&state->id_trap_lock); 1395 return (B_FALSE); 1396 } 1397 state->id_trap_inprog++; 1398 mutex_exit(&state->id_trap_lock); 1399 return (B_TRUE); 1400 } 1401 1402 /* 1403 * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet 1404 * trap or event handling to complete to kill the async thread and deconstruct 1405 * the mcg/ace list. 1406 */ 1407 static void 1408 ibd_async_done(ibd_state_t *state) 1409 { 1410 mutex_enter(&state->id_trap_lock); 1411 if (--state->id_trap_inprog == 0) 1412 cv_signal(&state->id_trap_cv); 1413 mutex_exit(&state->id_trap_lock); 1414 } 1415 1416 /* 1417 * Hash functions: 1418 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1419 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1420 * These operate on mac addresses input into ibd_send, but there is no 1421 * guarantee on the alignment of the ipoib_mac_t structure. 1422 */ 1423 /*ARGSUSED*/ 1424 static uint_t 1425 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1426 { 1427 ulong_t ptraddr = (ulong_t)key; 1428 uint_t hval; 1429 1430 /* 1431 * If the input address is 4 byte aligned, we can just dereference 1432 * it. 
This is most common, since IP will send in a 4 byte aligned 1433 * IP header, which implies the 24 byte IPoIB psuedo header will be 1434 * 4 byte aligned too. 1435 */ 1436 if ((ptraddr & 3) == 0) 1437 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1438 1439 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1440 return (hval); 1441 } 1442 1443 static int 1444 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1445 { 1446 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1447 return (0); 1448 else 1449 return (1); 1450 } 1451 1452 /* 1453 * Initialize all the per interface caches and lists; AH cache, 1454 * MCG list etc. 1455 */ 1456 static int 1457 ibd_acache_init(ibd_state_t *state) 1458 { 1459 ibd_ace_t *ce; 1460 int i; 1461 1462 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1463 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1464 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1465 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1466 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1467 offsetof(ibd_ace_t, ac_list)); 1468 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1469 offsetof(ibd_ace_t, ac_list)); 1470 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1471 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1472 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1473 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1474 offsetof(ibd_mce_t, mc_list)); 1475 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1476 offsetof(ibd_mce_t, mc_list)); 1477 list_create(&state->id_req_list, sizeof (ibd_req_t), 1478 offsetof(ibd_req_t, rq_list)); 1479 1480 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1481 IBD_NUM_AH, KM_SLEEP); 1482 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1483 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1484 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1485 
ibd_acache_fini(state); 1486 return (DDI_FAILURE); 1487 } else { 1488 CLEAR_REFCYCLE(ce); 1489 ce->ac_mce = NULL; 1490 IBD_ACACHE_INSERT_FREE(state, ce); 1491 } 1492 } 1493 return (DDI_SUCCESS); 1494 } 1495 1496 static void 1497 ibd_acache_fini(ibd_state_t *state) 1498 { 1499 ibd_ace_t *ptr; 1500 1501 mutex_enter(&state->id_ac_mutex); 1502 1503 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1504 ASSERT(GET_REF(ptr) == 0); 1505 (void) ibt_free_ud_dest(ptr->ac_dest); 1506 } 1507 1508 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1509 ASSERT(GET_REF(ptr) == 0); 1510 (void) ibt_free_ud_dest(ptr->ac_dest); 1511 } 1512 1513 list_destroy(&state->id_ah_free); 1514 list_destroy(&state->id_ah_active); 1515 list_destroy(&state->id_mc_full); 1516 list_destroy(&state->id_mc_non); 1517 list_destroy(&state->id_req_list); 1518 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1519 mutex_exit(&state->id_ac_mutex); 1520 mutex_destroy(&state->id_ac_mutex); 1521 mutex_destroy(&state->id_mc_mutex); 1522 mutex_destroy(&state->id_acache_req_lock); 1523 cv_destroy(&state->id_acache_req_cv); 1524 } 1525 1526 /* 1527 * Search AH active hash list for a cached path to input destination. 1528 * If we are "just looking", hold == F. When we are in the Tx path, 1529 * we set hold == T to grab a reference on the AH so that it can not 1530 * be recycled to a new destination while the Tx request is posted. 1531 */ 1532 static ibd_ace_t * 1533 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1534 { 1535 ibd_ace_t *ptr; 1536 1537 ASSERT(mutex_owned(&state->id_ac_mutex)); 1538 1539 /* 1540 * Do hash search. 
1541 */ 1542 if (mod_hash_find(state->id_ah_active_hash, 1543 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1544 if (hold) 1545 INC_REF(ptr, num); 1546 return (ptr); 1547 } 1548 return (NULL); 1549 } 1550 1551 /* 1552 * This is called by the tx side; if an initialized AH is found in 1553 * the active list, it is locked down and can be used; if no entry 1554 * is found, an async request is queued to do path resolution. 1555 */ 1556 static ibd_ace_t * 1557 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1558 { 1559 ibd_ace_t *ptr; 1560 ibd_req_t *req; 1561 1562 /* 1563 * Only attempt to print when we can; in the mdt pattr case, the 1564 * address is not aligned properly. 1565 */ 1566 if (((ulong_t)mac & 3) == 0) 1567 DPRINT(4, 1568 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1569 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1570 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1571 htonl(mac->ipoib_gidsuff[1])); 1572 1573 mutex_enter(&state->id_ac_mutex); 1574 1575 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1576 mutex_exit(&state->id_ac_mutex); 1577 return (ptr); 1578 } 1579 1580 /* 1581 * Implementation of a single outstanding async request; if 1582 * the operation is not started yet, queue a request and move 1583 * to ongoing state. Remember in id_ah_addr for which address 1584 * we are queueing the request, in case we need to flag an error; 1585 * Any further requests, for the same or different address, until 1586 * the operation completes, is sent back to GLDv3 to be retried. 1587 * The async thread will update id_ah_op with an error indication 1588 * or will set it to indicate the next look up can start; either 1589 * way, it will mac_tx_update() so that all blocked requests come 1590 * back here. 
1591 */ 1592 *err = EAGAIN; 1593 if (state->id_ah_op == NOTSTARTED) { 1594 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1595 if (req != NULL) { 1596 /* 1597 * We did not even find the entry; queue a request 1598 * for it. 1599 */ 1600 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1601 ibd_queue_work_slot(state, req, ASYNC_GETAH); 1602 state->id_ah_op = ONGOING; 1603 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1604 } 1605 } else if ((state->id_ah_op != ONGOING) && 1606 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1607 /* 1608 * Check the status of the pathrecord lookup request 1609 * we had queued before. 1610 */ 1611 if (state->id_ah_op == ERRORED) { 1612 *err = EFAULT; 1613 state->id_ah_error++; 1614 } else { 1615 /* 1616 * ROUTERED case: We need to send to the 1617 * all-router MCG. If we can find the AH for 1618 * the mcg, the Tx will be attempted. If we 1619 * do not find the AH, we return NORESOURCES 1620 * to retry. 1621 */ 1622 ipoib_mac_t routermac; 1623 1624 (void) ibd_get_allroutergroup(state, mac, &routermac); 1625 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1626 numwqe); 1627 } 1628 state->id_ah_op = NOTSTARTED; 1629 } else if ((state->id_ah_op != ONGOING) && 1630 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1631 /* 1632 * This case can happen when we get a higher band 1633 * packet. The easiest way is to reset the state machine 1634 * to accommodate the higher priority packet. 1635 */ 1636 state->id_ah_op = NOTSTARTED; 1637 } 1638 mutex_exit(&state->id_ac_mutex); 1639 1640 return (ptr); 1641 } 1642 1643 /* 1644 * Grab a not-currently-in-use AH/PathRecord from the active 1645 * list to recycle to a new destination. Only the async thread 1646 * executes this code. 1647 */ 1648 static ibd_ace_t * 1649 ibd_acache_get_unref(ibd_state_t *state) 1650 { 1651 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1652 1653 ASSERT(mutex_owned(&state->id_ac_mutex)); 1654 1655 /* 1656 * Do plain linear search. 
1657 */ 1658 while (ptr != NULL) { 1659 /* 1660 * Note that it is possible that the "cycle" bit 1661 * is set on the AH w/o any reference count. The 1662 * mcg must have been deleted, and the tx cleanup 1663 * just decremented the reference count to 0, but 1664 * hasn't gotten around to grabbing the id_ac_mutex 1665 * to move the AH into the free list. 1666 */ 1667 if (GET_REF(ptr) == 0) { 1668 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1669 break; 1670 } 1671 ptr = list_next(&state->id_ah_active, ptr); 1672 } 1673 return (ptr); 1674 } 1675 1676 /* 1677 * Invoked to clean up AH from active list in case of multicast 1678 * disable and to handle sendonly memberships during mcg traps. 1679 * And for port up processing for multicast and unicast AHs. 1680 * Normally, the AH is taken off the active list, and put into 1681 * the free list to be recycled for a new destination. In case 1682 * Tx requests on the AH have not completed yet, the AH is marked 1683 * for reaping (which will put the AH on the free list) once the Tx's 1684 * complete; in this case, depending on the "force" input, we take 1685 * out the AH from the active list right now, or leave it also for 1686 * the reap operation. Returns TRUE if the AH is taken off the active 1687 * list (and either put into the free list right now, or arranged for 1688 * later), FALSE otherwise. 1689 */ 1690 static boolean_t 1691 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1692 { 1693 ibd_ace_t *acactive; 1694 boolean_t ret = B_TRUE; 1695 1696 ASSERT(mutex_owned(&state->id_ac_mutex)); 1697 1698 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1699 1700 /* 1701 * Note that the AH might already have the cycle bit set 1702 * on it; this might happen if sequences of multicast 1703 * enables and disables are coming so fast, that posted 1704 * Tx's to the mcg have not completed yet, and the cycle 1705 * bit is set successively by each multicast disable. 
1706 */ 1707 if (SET_CYCLE_IF_REF(acactive)) { 1708 if (!force) { 1709 /* 1710 * The ace is kept on the active list, further 1711 * Tx's can still grab a reference on it; the 1712 * ace is reaped when all pending Tx's 1713 * referencing the AH complete. 1714 */ 1715 ret = B_FALSE; 1716 } else { 1717 /* 1718 * In the mcg trap case, we always pull the 1719 * AH from the active list. And also the port 1720 * up multi/unicast case. 1721 */ 1722 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1723 acactive->ac_mce = NULL; 1724 } 1725 } else { 1726 /* 1727 * Determined the ref count is 0, thus reclaim 1728 * immediately after pulling out the ace from 1729 * the active list. 1730 */ 1731 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1732 acactive->ac_mce = NULL; 1733 IBD_ACACHE_INSERT_FREE(state, acactive); 1734 } 1735 1736 } 1737 return (ret); 1738 } 1739 1740 /* 1741 * Helper function for async path record lookup. If we are trying to 1742 * Tx to a MCG, check our membership, possibly trying to join the 1743 * group if required. If that fails, try to send the packet to the 1744 * all router group (indicated by the redirect output), pointing 1745 * the input mac address to the router mcg address. 1746 */ 1747 static ibd_mce_t * 1748 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1749 { 1750 ib_gid_t mgid; 1751 ibd_mce_t *mce; 1752 ipoib_mac_t routermac; 1753 1754 *redirect = B_FALSE; 1755 ibd_n2h_gid(mac, &mgid); 1756 1757 /* 1758 * Check the FullMember+SendOnlyNonMember list. 1759 * Since we are the only one who manipulates the 1760 * id_mc_full list, no locks are needed. 1761 */ 1762 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1763 if (mce != NULL) { 1764 DPRINT(4, "ibd_async_mcache : already joined to group"); 1765 return (mce); 1766 } 1767 1768 /* 1769 * Not found; try to join(SendOnlyNonMember) and attach. 
1770 */ 1771 DPRINT(4, "ibd_async_mcache : not joined to group"); 1772 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1773 NULL) { 1774 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1775 return (mce); 1776 } 1777 1778 /* 1779 * MCGroup not present; try to join the all-router group. If 1780 * any of the following steps succeed, we will be redirecting 1781 * to the all router group. 1782 */ 1783 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1784 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1785 return (NULL); 1786 *redirect = B_TRUE; 1787 ibd_n2h_gid(&routermac, &mgid); 1788 bcopy(&routermac, mac, IPOIB_ADDRL); 1789 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1790 mgid.gid_prefix, mgid.gid_guid); 1791 1792 /* 1793 * Are we already joined to the router group? 1794 */ 1795 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1796 DPRINT(4, "ibd_async_mcache : using already joined router" 1797 "group\n"); 1798 return (mce); 1799 } 1800 1801 /* 1802 * Can we join(SendOnlyNonMember) the router group? 1803 */ 1804 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1805 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1806 NULL) { 1807 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1808 return (mce); 1809 } 1810 1811 return (NULL); 1812 } 1813 1814 /* 1815 * Async path record lookup code. 1816 */ 1817 static void 1818 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1819 { 1820 ibd_ace_t *ce; 1821 ibd_mce_t *mce = NULL; 1822 ibt_path_attr_t path_attr; 1823 ibt_path_info_t path_info; 1824 ib_gid_t destgid; 1825 int ret = NOTSTARTED; 1826 1827 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1828 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1829 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1830 htonl(mac->ipoib_gidsuff[1])); 1831 1832 /* 1833 * Check whether we are trying to transmit to a MCG. 
1834 * In that case, we need to make sure we are a member of 1835 * the MCG. 1836 */ 1837 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1838 boolean_t redirected; 1839 1840 /* 1841 * If we can not find or join the group or even 1842 * redirect, error out. 1843 */ 1844 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1845 NULL) { 1846 state->id_ah_op = ERRORED; 1847 return; 1848 } 1849 1850 /* 1851 * If we got redirected, we need to determine whether 1852 * the AH for the new mcg is in the cache already, and 1853 * not pull it in then; otherwise proceed to get the 1854 * path for the new mcg. There is no guarantee that 1855 * if the AH is currently in the cache, it will still be 1856 * there when we look in ibd_acache_lookup(), but that's 1857 * okay, we will come back here. 1858 */ 1859 if (redirected) { 1860 ret = ROUTERED; 1861 DPRINT(4, "ibd_async_acache : redirected to " 1862 "%08X:%08X:%08X:%08X:%08X", 1863 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1864 htonl(mac->ipoib_gidpref[1]), 1865 htonl(mac->ipoib_gidsuff[0]), 1866 htonl(mac->ipoib_gidsuff[1])); 1867 1868 mutex_enter(&state->id_ac_mutex); 1869 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1870 mutex_exit(&state->id_ac_mutex); 1871 DPRINT(4, "ibd_async_acache : router AH found"); 1872 state->id_ah_op = ROUTERED; 1873 return; 1874 } 1875 mutex_exit(&state->id_ac_mutex); 1876 } 1877 } 1878 1879 /* 1880 * Get an AH from the free list. 1881 */ 1882 mutex_enter(&state->id_ac_mutex); 1883 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1884 /* 1885 * No free ones; try to grab an unreferenced active 1886 * one. Maybe we need to make the active list LRU, 1887 * but that will create more work for Tx callbacks. 1888 * Is there a way of not having to pull out the 1889 * entry from the active list, but just indicate it 1890 * is being recycled? Yes, but that creates one more 1891 * check in the fast lookup path. 
1892 */ 1893 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1894 /* 1895 * Pretty serious shortage now. 1896 */ 1897 state->id_ah_op = NOTSTARTED; 1898 mutex_exit(&state->id_ac_mutex); 1899 DPRINT(10, "ibd_async_acache : failed to find AH " 1900 "slot\n"); 1901 return; 1902 } 1903 /* 1904 * We could check whether ac_mce points to a SendOnly 1905 * member and drop that membership now. Or do it lazily 1906 * at detach time. 1907 */ 1908 ce->ac_mce = NULL; 1909 } 1910 mutex_exit(&state->id_ac_mutex); 1911 ASSERT(ce->ac_mce == NULL); 1912 1913 /* 1914 * Update the entry. 1915 */ 1916 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1917 1918 bzero(&path_info, sizeof (path_info)); 1919 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1920 path_attr.pa_sgid = state->id_sgid; 1921 path_attr.pa_num_dgids = 1; 1922 ibd_n2h_gid(&ce->ac_mac, &destgid); 1923 path_attr.pa_dgids = &destgid; 1924 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1925 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1926 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1927 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1928 goto error; 1929 } 1930 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1931 ntohl(ce->ac_mac.ipoib_qpn), 1932 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1933 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1934 goto error; 1935 } 1936 1937 /* 1938 * mce is set whenever an AH is being associated with a 1939 * MCG; this will come in handy when we leave the MCG. The 1940 * lock protects Tx fastpath from scanning the active list. 1941 */ 1942 if (mce != NULL) 1943 ce->ac_mce = mce; 1944 mutex_enter(&state->id_ac_mutex); 1945 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1946 state->id_ah_op = ret; 1947 mutex_exit(&state->id_ac_mutex); 1948 return; 1949 error: 1950 /* 1951 * We might want to drop SendOnly membership here if we 1952 * joined above. The lock protects Tx callbacks inserting 1953 * into the free list. 
1954 */ 1955 mutex_enter(&state->id_ac_mutex); 1956 state->id_ah_op = ERRORED; 1957 IBD_ACACHE_INSERT_FREE(state, ce); 1958 mutex_exit(&state->id_ac_mutex); 1959 } 1960 1961 /* 1962 * While restoring port's presence on the subnet on a port up, it is possible 1963 * that the port goes down again. 1964 */ 1965 static void 1966 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1967 { 1968 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1969 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1970 LINK_STATE_UP; 1971 ibd_mce_t *mce, *pmce; 1972 ibd_ace_t *ace, *pace; 1973 1974 DPRINT(10, "ibd_async_link(): %d", opcode); 1975 1976 /* 1977 * On a link up, revalidate the link speed/width. No point doing 1978 * this on a link down, since we will be unable to do SA operations, 1979 * defaulting to the lowest speed. Also notice that we update our 1980 * notion of speed before calling mac_link_update(), which will do 1981 * neccesary higher level notifications for speed changes. 1982 */ 1983 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1984 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1985 state->id_link_speed = ibd_get_portspeed(state); 1986 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1987 } 1988 1989 /* 1990 * Do all the work required to establish our presence on 1991 * the subnet. 1992 */ 1993 if (opcode == IBD_LINK_UP_ABSENT) { 1994 /* 1995 * If in promiscuous mode ... 1996 */ 1997 if (state->id_prom_op == COMPLETED) { 1998 /* 1999 * Drop all nonmembership. 2000 */ 2001 ibd_async_unsetprom(state); 2002 2003 /* 2004 * Then, try to regain nonmembership to all mcg's. 2005 */ 2006 ibd_async_setprom(state); 2007 2008 } 2009 2010 /* 2011 * Drop all sendonly membership (which also gets rid of the 2012 * AHs); try to reacquire all full membership. 
2013 */ 2014 mce = list_head(&state->id_mc_full); 2015 while ((pmce = mce) != NULL) { 2016 mce = list_next(&state->id_mc_full, mce); 2017 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 2018 ibd_leave_group(state, 2019 pmce->mc_info.mc_adds_vect.av_dgid, 2020 IB_MC_JSTATE_SEND_ONLY_NON); 2021 else 2022 ibd_reacquire_group(state, pmce); 2023 } 2024 2025 /* 2026 * Recycle all active AHs to free list (and if there are 2027 * pending posts, make sure they will go into the free list 2028 * once the Tx's complete). Grab the lock to prevent 2029 * concurrent Tx's as well as Tx cleanups. 2030 */ 2031 mutex_enter(&state->id_ac_mutex); 2032 ace = list_head(&state->id_ah_active); 2033 while ((pace = ace) != NULL) { 2034 boolean_t cycled; 2035 2036 ace = list_next(&state->id_ah_active, ace); 2037 mce = pace->ac_mce; 2038 cycled = ibd_acache_recycle(state, &pace->ac_mac, 2039 B_TRUE); 2040 /* 2041 * If this is for an mcg, it must be for a fullmember, 2042 * since we got rid of send-only members above when 2043 * processing the mce list. 2044 */ 2045 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2046 IB_MC_JSTATE_FULL))); 2047 2048 /* 2049 * Check if the fullmember mce needs to be torn down, 2050 * ie whether the DLPI disable has already been done. 2051 * If so, do some of the work of tx_cleanup, namely 2052 * causing leave (which will fail), detach and 2053 * mce-freeing. tx_cleanup will put the AH into free 2054 * list. The reason to duplicate some of this 2055 * tx_cleanup work is because we want to delete the 2056 * AH right now instead of waiting for tx_cleanup, to 2057 * force subsequent Tx's to reacquire an AH. 
2058 */ 2059 if ((mce != NULL) && (mce->mc_fullreap)) 2060 ibd_async_reap_group(state, mce, 2061 mce->mc_info.mc_adds_vect.av_dgid, 2062 mce->mc_jstate); 2063 } 2064 mutex_exit(&state->id_ac_mutex); 2065 } 2066 2067 /* 2068 * mac handle is guaranteed to exist since driver does ibt_close_hca() 2069 * (which stops further events from being delivered) before 2070 * mac_unreigster(). At this point, it is guaranteed that mac_register 2071 * has already been done. 2072 */ 2073 mutex_enter(&state->id_link_mutex); 2074 state->id_link_state = lstate; 2075 mac_link_update(state->id_mh, lstate); 2076 mutex_exit(&state->id_link_mutex); 2077 2078 ibd_async_done(state); 2079 } 2080 2081 /* 2082 * When the link is notified up, we need to do a few things, based 2083 * on the port's current p_init_type_reply claiming a reinit has been 2084 * done or not. The reinit steps are: 2085 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2086 * the old Pkey and GID0 are correct. 2087 * 2. Register for mcg traps (already done by ibmf). 2088 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2089 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2090 * 4. Give up all sendonly memberships. 2091 * 5. Acquire all full memberships. 2092 * 6. In promiscuous mode, acquire all non memberships. 2093 * 7. Recycle all AHs to free list. 2094 */ 2095 static void 2096 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2097 { 2098 ibt_hca_portinfo_t *port_infop; 2099 ibt_status_t ibt_status; 2100 uint_t psize, port_infosz; 2101 ibd_link_op_t opcode; 2102 ibd_req_t *req; 2103 2104 /* 2105 * Do not send a request to the async daemon if it has not 2106 * yet been created or is being destroyed. If the async 2107 * daemon has not yet been created, we still need to track 2108 * last known state of the link. 
If this code races with the 2109 * detach path, then we are assured that the detach path has 2110 * not yet done the ibt_close_hca (which waits for all async 2111 * events to complete). If the code races with the attach path, 2112 * we need to validate the pkey/gid (in the link_up case) if 2113 * the initialization path has already set these up and created 2114 * IBTF resources based on the values. 2115 */ 2116 mutex_enter(&state->id_link_mutex); 2117 2118 /* 2119 * If the init code in ibd_drv_init hasn't yet set up the 2120 * pkey/gid, nothing to do; that code will set the link state. 2121 */ 2122 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2123 mutex_exit(&state->id_link_mutex); 2124 return; 2125 } 2126 2127 if (code == IBT_EVENT_PORT_UP) { 2128 uint8_t itreply; 2129 boolean_t badup = B_FALSE; 2130 2131 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 2132 state->id_port, &port_infop, &psize, &port_infosz); 2133 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 2134 mutex_exit(&state->id_link_mutex); 2135 DPRINT(10, "ibd_link_up : failed in" 2136 " ibt_query_port()\n"); 2137 return; 2138 } 2139 2140 /* 2141 * If the link already went down by the time the handler gets 2142 * here, give up; we can not even validate pkey/gid since those 2143 * are not valid. 2144 */ 2145 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) 2146 badup = B_TRUE; 2147 2148 itreply = port_infop->p_init_type_reply; 2149 2150 /* 2151 * In InitTypeReply, check if NoLoadReply == 2152 * PreserveContentReply == 0, in which case, verify Pkey/GID0. 2153 */ 2154 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2155 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && 2156 (!badup)) { 2157 /* 2158 * Check that the subnet part of GID0 has not changed. 2159 */ 2160 if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, 2161 sizeof (ib_gid_t)) != 0) 2162 badup = B_TRUE; 2163 2164 /* 2165 * Check that Pkey/index mapping is still valid. 
2166 */ 2167 if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || 2168 (port_infop->p_pkey_tbl[state->id_pkix] != 2169 state->id_pkey)) 2170 badup = B_TRUE; 2171 } 2172 2173 /* 2174 * In InitTypeReply, if PreservePresenceReply indicates the SM 2175 * has ensured that the port's presence in mcg, traps etc is 2176 * intact, nothing more to do. 2177 */ 2178 opcode = IBD_LINK_UP_ABSENT; 2179 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2180 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) 2181 opcode = IBD_LINK_UP; 2182 2183 if (badup) 2184 code = IBT_ERROR_PORT_DOWN; 2185 ibt_free_portinfo(port_infop, port_infosz); 2186 } 2187 2188 if (!ibd_async_safe(state)) { 2189 state->id_link_state = ((code == IBT_EVENT_PORT_UP) ? 2190 LINK_STATE_UP : LINK_STATE_DOWN); 2191 mutex_exit(&state->id_link_mutex); 2192 return; 2193 } 2194 mutex_exit(&state->id_link_mutex); 2195 2196 if (code == IBT_ERROR_PORT_DOWN) 2197 opcode = IBD_LINK_DOWN; 2198 2199 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2200 req->rq_ptr = (void *)opcode; 2201 ibd_queue_work_slot(state, req, ASYNC_LINK); 2202 } 2203 2204 /* 2205 * For the port up/down events, IBTL guarantees there will not be concurrent 2206 * invocations of the handler. 
IBTL might coalesce link transition events, 2207 * and not invoke the handler for _each_ up/down transition, but it will 2208 * invoke the handler with last known state 2209 */ 2210 static void 2211 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2212 ibt_async_code_t code, ibt_async_event_t *event) 2213 { 2214 ibd_state_t *state = (ibd_state_t *)clnt_private; 2215 2216 switch (code) { 2217 case IBT_ERROR_CATASTROPHIC_CHAN: 2218 ibd_print_warn(state, "catastrophic channel error"); 2219 break; 2220 case IBT_ERROR_CQ: 2221 ibd_print_warn(state, "completion queue error"); 2222 break; 2223 case IBT_ERROR_PORT_DOWN: 2224 case IBT_EVENT_PORT_UP: 2225 /* 2226 * Events will be delivered to all instances that have 2227 * done ibt_open_hca() but not yet done ibt_close_hca(). 2228 * Only need to do work for our port; IBTF will deliver 2229 * events for other ports on the hca we have ibt_open_hca'ed 2230 * too. Note that ibd_drv_init() initializes id_port before 2231 * doing ibt_open_hca(). 2232 */ 2233 ASSERT(state->id_hca_hdl == hca_hdl); 2234 if (state->id_port != event->ev_port) 2235 break; 2236 2237 ibd_link_mod(state, code); 2238 break; 2239 2240 case IBT_HCA_ATTACH_EVENT: 2241 case IBT_HCA_DETACH_EVENT: 2242 /* 2243 * When a new card is plugged to the system, attach_event is 2244 * invoked. Additionally, a cfgadm needs to be run to make the 2245 * card known to the system, and an ifconfig needs to be run to 2246 * plumb up any ibd interfaces on the card. In the case of card 2247 * unplug, a cfgadm is run that will trigger any RCM scripts to 2248 * unplumb the ibd interfaces on the card; when the card is 2249 * actually unplugged, the detach_event is invoked; 2250 * additionally, if any ibd instances are still active on the 2251 * card (eg there were no associated RCM scripts), driver's 2252 * detach routine is invoked. 2253 */ 2254 break; 2255 default: 2256 break; 2257 } 2258 } 2259 2260 /* 2261 * Attach device to the IO framework. 
2262 */ 2263 static int 2264 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2265 { 2266 mac_register_t *macp; 2267 ibd_state_t *state; 2268 int instance; 2269 int err; 2270 2271 switch (cmd) { 2272 case DDI_ATTACH: 2273 break; 2274 case DDI_RESUME: 2275 /* This driver does not support resume */ 2276 default: 2277 return (DDI_FAILURE); 2278 } 2279 2280 /* 2281 * Allocate soft device data structure 2282 */ 2283 instance = ddi_get_instance(dip); 2284 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2285 return (DDI_FAILURE); 2286 state = ddi_get_soft_state(ibd_list, instance); 2287 2288 /* pre ibt_attach() soft state initialization */ 2289 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2290 DPRINT(10, "ibd_attach : failed in ibd_state_init()"); 2291 goto attach_fail_state_init; 2292 } 2293 2294 /* alloc rx soft intr */ 2295 if ((ibd_rx_softintr == 1) && 2296 ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2297 NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) { 2298 DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); 2299 goto attach_fail_ddi_add_rx_softintr; 2300 } 2301 2302 /* alloc tx soft intr */ 2303 if ((ibd_tx_softintr == 1) && 2304 ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2305 NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) { 2306 DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); 2307 goto attach_fail_ddi_add_tx_softintr; 2308 } 2309 2310 /* "attach" to IBTL */ 2311 if (ibt_attach(&ibd_clnt_modinfo, dip, state, 2312 &state->id_ibt_hdl) != IBT_SUCCESS) { 2313 DPRINT(10, "ibd_attach : failed in ibt_attach()"); 2314 goto attach_fail_ibt_attach; 2315 } 2316 2317 /* Finish initializing this driver */ 2318 if (ibd_drv_init(state) != DDI_SUCCESS) { 2319 DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); 2320 goto attach_fail_drv_init; 2321 } 2322 2323 /* 2324 * Initialize pointers to device specific functions which will be 2325 * used by the generic layer. 
2326 */ 2327 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2328 DPRINT(10, "ibd_attach : failed in mac_alloc()"); 2329 goto attach_fail_drv_init; 2330 } 2331 2332 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2333 macp->m_driver = state; 2334 macp->m_dip = state->id_dip; 2335 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2336 macp->m_callbacks = &ib_m_callbacks; 2337 macp->m_min_sdu = 0; 2338 macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE; 2339 2340 /* 2341 * Register ourselves with the GLDv3 interface 2342 */ 2343 err = mac_register(macp, &state->id_mh); 2344 mac_free(macp); 2345 if (err != 0) { 2346 DPRINT(10, "ibd_attach : failed in mac_register()"); 2347 goto attach_fail_mac_register; 2348 } 2349 2350 /* 2351 * Setup the handler we will use for regular DLPI stuff. Its important 2352 * to setup the recv handler after registering with gldv3. 2353 */ 2354 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2355 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 2356 IBT_SUCCESS) { 2357 DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); 2358 goto attach_fail_setup_handler; 2359 } 2360 2361 /* 2362 * Setup the subnet notices handler after we initialize the a/mcaches 2363 * and start the async thread, both of which are required for the 2364 * trap handler to function properly. Enable the trap handler to 2365 * queue requests to the async thread after the mac_register, because 2366 * the async daemon invokes mac_tx_update(), which must be done after 2367 * mac_register(). 2368 */ 2369 ibt_register_subnet_notices(state->id_ibt_hdl, 2370 ibd_snet_notices_handler, state); 2371 mutex_enter(&state->id_trap_lock); 2372 state->id_trap_stop = B_FALSE; 2373 mutex_exit(&state->id_trap_lock); 2374 2375 /* 2376 * Indicate link status to GLDv3 and higher layers. 
By default, 2377 * we assume we are in up state (which must have been true at 2378 * least at the time the broadcast mcg's were probed); if there 2379 * were any up/down transitions till the time we come here, the 2380 * async handler will have updated last known state, which we 2381 * use to tell GLDv3. The async handler will not send any 2382 * notifications to GLDv3 till we reach here in the initialization 2383 * sequence. 2384 */ 2385 mac_link_update(state->id_mh, state->id_link_state); 2386 2387 return (DDI_SUCCESS); 2388 2389 /* Attach failure points, cleanup */ 2390 attach_fail_setup_handler: 2391 (void) mac_unregister(state->id_mh); 2392 2393 attach_fail_mac_register: 2394 ibd_drv_fini(state); 2395 2396 attach_fail_drv_init: 2397 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2398 ibd_print_warn(state, "failed to free IB resources"); 2399 2400 attach_fail_ibt_attach: 2401 if (ibd_tx_softintr == 1) 2402 ddi_remove_softintr(state->id_tx); 2403 2404 attach_fail_ddi_add_tx_softintr: 2405 if (ibd_rx_softintr == 1) 2406 ddi_remove_softintr(state->id_rx); 2407 2408 attach_fail_ddi_add_rx_softintr: 2409 ibd_state_fini(state); 2410 2411 attach_fail_state_init: 2412 ddi_soft_state_free(ibd_list, instance); 2413 2414 return (DDI_FAILURE); 2415 } 2416 2417 /* 2418 * Detach device from the IO framework. 2419 */ 2420 static int 2421 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2422 { 2423 ibd_state_t *state; 2424 int status; 2425 int instance; 2426 2427 switch (cmd) { 2428 case DDI_DETACH: 2429 break; 2430 case DDI_SUSPEND: 2431 default: 2432 return (DDI_FAILURE); 2433 } 2434 2435 instance = ddi_get_instance(dip); 2436 state = ddi_get_soft_state(ibd_list, instance); 2437 2438 /* 2439 * First, stop receive interrupts; this stops the 2440 * driver from handing up buffers to higher layers. 2441 * Wait for receive buffers to be returned; give up 2442 * after 5 seconds. 
2443 */ 2444 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 2445 status = 50; 2446 while (state->id_rx_list.dl_bufs_outstanding > 0) { 2447 delay(drv_usectohz(100000)); 2448 if (--status == 0) { 2449 DPRINT(2, "ibd_detach : reclaiming failed"); 2450 goto failed; 2451 } 2452 } 2453 2454 if (mac_unregister(state->id_mh) != DDI_SUCCESS) { 2455 DPRINT(10, "ibd_detach : failed in mac_unregister()"); 2456 goto failed; 2457 } 2458 2459 if (ibd_rx_softintr == 1) 2460 ddi_remove_softintr(state->id_rx); 2461 2462 if (ibd_tx_softintr == 1) 2463 ddi_remove_softintr(state->id_tx); 2464 2465 ibd_drv_fini(state); 2466 2467 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2468 ibd_print_warn(state, "failed to free all IB resources at " 2469 "driver detach time"); 2470 2471 ibd_state_fini(state); 2472 ddi_soft_state_free(ibd_list, instance); 2473 return (DDI_SUCCESS); 2474 2475 failed: 2476 /* 2477 * Reap all the Tx/Rx completions that were posted since we 2478 * turned off the notification. Turn on notifications. There 2479 * is a race in that we do not reap completions that come in 2480 * after the poll and before notifications get turned on. That 2481 * is okay, the next rx/tx packet will trigger a completion 2482 * that will reap any missed completions. 
2483 */ 2484 ibd_poll_compq(state, state->id_rcq_hdl); 2485 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2486 return (DDI_FAILURE); 2487 } 2488 2489 /* 2490 * Pre ibt_attach() driver initialization 2491 */ 2492 static int 2493 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2494 { 2495 char buf[64]; 2496 2497 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2498 state->id_link_state = LINK_STATE_UNKNOWN; 2499 2500 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2501 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2502 state->id_trap_stop = B_TRUE; 2503 state->id_trap_inprog = 0; 2504 2505 mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL); 2506 state->id_dip = dip; 2507 2508 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2509 2510 state->id_tx_list.dl_head = NULL; 2511 state->id_tx_list.dl_tail = NULL; 2512 state->id_tx_list.dl_pending_sends = B_FALSE; 2513 state->id_tx_list.dl_cnt = 0; 2514 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2515 2516 state->id_rx_list.dl_head = NULL; 2517 state->id_rx_list.dl_tail = NULL; 2518 state->id_rx_list.dl_bufs_outstanding = 0; 2519 state->id_rx_list.dl_cnt = 0; 2520 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2521 mutex_init(&state->id_rx_mutex, NULL, MUTEX_DRIVER, NULL); 2522 2523 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2524 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2525 0, NULL, NULL, NULL, NULL, NULL, 0); 2526 2527 return (DDI_SUCCESS); 2528 } 2529 2530 /* 2531 * Post ibt_detach() driver deconstruction 2532 */ 2533 static void 2534 ibd_state_fini(ibd_state_t *state) 2535 { 2536 mutex_destroy(&state->id_tx_list.dl_mutex); 2537 mutex_destroy(&state->id_rx_list.dl_mutex); 2538 mutex_destroy(&state->id_rx_mutex); 2539 mutex_destroy(&state->id_sched_lock); 2540 mutex_destroy(&state->id_txcomp_lock); 2541 2542 cv_destroy(&state->id_trap_cv); 2543 
mutex_destroy(&state->id_trap_lock); 2544 mutex_destroy(&state->id_link_mutex); 2545 kmem_cache_destroy(state->id_req_kmc); 2546 } 2547 2548 /* 2549 * Fetch IBA parameters for the network device from IB nexus. 2550 */ 2551 static int 2552 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) 2553 { 2554 /* 2555 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. 2556 * Note that the default partition is also allowed. 2557 */ 2558 state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2559 0, "port-pkey", IB_PKEY_INVALID_LIMITED); 2560 if (state->id_pkey <= IB_PKEY_INVALID_FULL) { 2561 DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" 2562 "partition\n"); 2563 return (DDI_FAILURE); 2564 } 2565 2566 /* 2567 * ... the IBA port ... 2568 */ 2569 state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2570 0, "port-number", 0); 2571 if (state->id_port == 0) { 2572 DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); 2573 return (DDI_FAILURE); 2574 } 2575 2576 /* 2577 * ... and HCA GUID. 2578 */ 2579 *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 2580 0, "hca-guid", 0); 2581 if (*hca_guid == 0) { 2582 DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " 2583 "guid\n"); 2584 return (DDI_FAILURE); 2585 } 2586 2587 return (DDI_SUCCESS); 2588 } 2589 2590 /* 2591 * Fetch link speed from SA for snmp ifspeed reporting. 2592 */ 2593 static uint64_t 2594 ibd_get_portspeed(ibd_state_t *state) 2595 { 2596 int ret; 2597 ibt_path_info_t path; 2598 ibt_path_attr_t path_attr; 2599 uint8_t num_paths; 2600 uint64_t ifspeed; 2601 2602 /* 2603 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2604 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2605 * 2000000000. Start with that as default. 2606 */ 2607 ifspeed = 2000000000; 2608 2609 bzero(&path_attr, sizeof (path_attr)); 2610 2611 /* 2612 * Get the port speed from Loopback path information. 
2613 */ 2614 path_attr.pa_dgids = &state->id_sgid; 2615 path_attr.pa_num_dgids = 1; 2616 path_attr.pa_sgid = state->id_sgid; 2617 2618 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2619 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2620 goto earlydone; 2621 2622 if (num_paths < 1) 2623 goto earlydone; 2624 2625 /* 2626 * In case SA does not return an expected value, report the default 2627 * speed as 1X. 2628 */ 2629 ret = 1; 2630 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2631 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2632 ret = 1; 2633 break; 2634 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2635 ret = 4; 2636 break; 2637 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2638 ret = 12; 2639 break; 2640 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2641 ret = 2; 2642 break; 2643 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2644 ret = 8; 2645 break; 2646 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2647 ret = 16; 2648 break; 2649 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2650 ret = 24; 2651 break; 2652 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2653 ret = 32; 2654 break; 2655 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2656 ret = 48; 2657 break; 2658 } 2659 2660 ifspeed *= ret; 2661 2662 earlydone: 2663 return (ifspeed); 2664 } 2665 2666 /* 2667 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2668 * representing the input mcg mgid. 2669 */ 2670 static ibd_mce_t * 2671 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2672 { 2673 ibd_mce_t *ptr = list_head(mlist); 2674 2675 /* 2676 * Do plain linear search. 2677 */ 2678 while (ptr != NULL) { 2679 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2680 sizeof (ib_gid_t)) == 0) 2681 return (ptr); 2682 ptr = list_next(mlist, ptr); 2683 } 2684 return (NULL); 2685 } 2686 2687 /* 2688 * Execute IBA JOIN. 
2689 */ 2690 static ibt_status_t 2691 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2692 { 2693 ibt_mcg_attr_t mcg_attr; 2694 2695 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2696 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2697 mcg_attr.mc_mgid = mgid; 2698 mcg_attr.mc_join_state = mce->mc_jstate; 2699 mcg_attr.mc_scope = state->id_scope; 2700 mcg_attr.mc_pkey = state->id_pkey; 2701 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2702 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2703 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2704 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2705 NULL, NULL)); 2706 } 2707 2708 /* 2709 * This code JOINs the port in the proper way (depending on the join 2710 * state) so that IBA fabric will forward mcg packets to/from the port. 2711 * It also attaches the QPN to the mcg so it can receive those mcg 2712 * packets. This code makes sure not to attach the mcg to the QP if 2713 * that has been previously done due to the mcg being joined with a 2714 * different join state, even though this is not required by SWG_0216, 2715 * refid 3610. 2716 */ 2717 static ibd_mce_t * 2718 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2719 { 2720 ibt_status_t ibt_status; 2721 ibd_mce_t *mce, *tmce, *omce = NULL; 2722 boolean_t do_attach = B_TRUE; 2723 2724 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2725 jstate, mgid.gid_prefix, mgid.gid_guid); 2726 2727 /* 2728 * For enable_multicast Full member joins, we need to do some 2729 * extra work. If there is already an mce on the list that 2730 * indicates full membership, that means the membership has 2731 * not yet been dropped (since the disable_multicast was issued) 2732 * because there are pending Tx's to the mcg; in that case, just 2733 * mark the mce not to be reaped when the Tx completion queues 2734 * an async reap operation. 
2735 * 2736 * If there is already an mce on the list indicating sendonly 2737 * membership, try to promote to full membership. Be careful 2738 * not to deallocate the old mce, since there might be an AH 2739 * pointing to it; instead, update the old mce with new data 2740 * that tracks the full membership. 2741 */ 2742 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2743 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2744 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2745 ASSERT(omce->mc_fullreap); 2746 omce->mc_fullreap = B_FALSE; 2747 return (omce); 2748 } else { 2749 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2750 } 2751 } 2752 2753 /* 2754 * Allocate the ibd_mce_t to track this JOIN. 2755 */ 2756 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2757 mce->mc_fullreap = B_FALSE; 2758 mce->mc_jstate = jstate; 2759 2760 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2761 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2762 ibt_status); 2763 kmem_free(mce, sizeof (ibd_mce_t)); 2764 return (NULL); 2765 } 2766 2767 /* 2768 * Is an IBA attach required? Not if the interface is already joined 2769 * to the mcg in a different appropriate join state. 2770 */ 2771 if (jstate == IB_MC_JSTATE_NON) { 2772 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2773 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2774 do_attach = B_FALSE; 2775 } else if (jstate == IB_MC_JSTATE_FULL) { 2776 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2777 do_attach = B_FALSE; 2778 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2779 do_attach = B_FALSE; 2780 } 2781 2782 if (do_attach) { 2783 /* 2784 * Do the IBA attach. 
2785 */ 2786 DPRINT(10, "ibd_join_group : ibt_attach_mcg \n"); 2787 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2788 &mce->mc_info)) != IBT_SUCCESS) { 2789 DPRINT(10, "ibd_join_group : failed qp attachment " 2790 "%d\n", ibt_status); 2791 /* 2792 * NOTE that we should probably preserve the join info 2793 * in the list and later try to leave again at detach 2794 * time. 2795 */ 2796 (void) ibt_leave_mcg(state->id_sgid, mgid, 2797 state->id_sgid, jstate); 2798 kmem_free(mce, sizeof (ibd_mce_t)); 2799 return (NULL); 2800 } 2801 } 2802 2803 /* 2804 * Insert the ibd_mce_t in the proper list. 2805 */ 2806 if (jstate == IB_MC_JSTATE_NON) { 2807 IBD_MCACHE_INSERT_NON(state, mce); 2808 } else { 2809 /* 2810 * Set up the mc_req fields used for reaping the 2811 * mcg in case of delayed tx completion (see 2812 * ibd_tx_cleanup()). Also done for sendonly join in 2813 * case we are promoted to fullmembership later and 2814 * keep using the same mce. 2815 */ 2816 mce->mc_req.rq_gid = mgid; 2817 mce->mc_req.rq_ptr = mce; 2818 /* 2819 * Check whether this is the case of trying to join 2820 * full member, and we were already joined send only. 2821 * We try to drop our SendOnly membership, but it is 2822 * possible that the mcg does not exist anymore (and 2823 * the subnet trap never reached us), so the leave 2824 * operation might fail. 2825 */ 2826 if (omce != NULL) { 2827 (void) ibt_leave_mcg(state->id_sgid, mgid, 2828 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2829 omce->mc_jstate = IB_MC_JSTATE_FULL; 2830 bcopy(&mce->mc_info, &omce->mc_info, 2831 sizeof (ibt_mcg_info_t)); 2832 kmem_free(mce, sizeof (ibd_mce_t)); 2833 return (omce); 2834 } 2835 mutex_enter(&state->id_mc_mutex); 2836 IBD_MCACHE_INSERT_FULL(state, mce); 2837 mutex_exit(&state->id_mc_mutex); 2838 } 2839 2840 return (mce); 2841 } 2842 2843 /* 2844 * Called during port up event handling to attempt to reacquire full 2845 * membership to an mcg. Stripped down version of ibd_join_group(). 
2846 * Note that it is possible that the mcg might have gone away, and 2847 * gets recreated at this point. 2848 */ 2849 static void 2850 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2851 { 2852 ib_gid_t mgid; 2853 2854 /* 2855 * If the mc_fullreap flag is set, or this join fails, a subsequent 2856 * reap/leave is going to try to leave the group. We could prevent 2857 * that by adding a boolean flag into ibd_mce_t, if required. 2858 */ 2859 if (mce->mc_fullreap) 2860 return; 2861 2862 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2863 2864 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2865 mgid.gid_guid); 2866 2867 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2868 ibd_print_warn(state, "Failure on port up to rejoin " 2869 "multicast gid %016llx:%016llx", 2870 (u_longlong_t)mgid.gid_prefix, 2871 (u_longlong_t)mgid.gid_guid); 2872 } 2873 2874 /* 2875 * This code handles delayed Tx completion cleanups for mcg's to which 2876 * disable_multicast has been issued, regular mcg related cleanups during 2877 * disable_multicast, disable_promiscous and mcg traps, as well as 2878 * cleanups during driver detach time. Depending on the join state, 2879 * it deletes the mce from the appropriate list and issues the IBA 2880 * leave/detach; except in the disable_multicast case when the mce 2881 * is left on the active list for a subsequent Tx completion cleanup. 2882 */ 2883 static void 2884 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2885 uint8_t jstate) 2886 { 2887 ibd_mce_t *tmce; 2888 boolean_t do_detach = B_TRUE; 2889 2890 /* 2891 * Before detaching, we must check whether the other list 2892 * contains the mcg; if we detach blindly, the consumer 2893 * who set up the other list will also stop receiving 2894 * traffic. 2895 */ 2896 if (jstate == IB_MC_JSTATE_FULL) { 2897 /* 2898 * The following check is only relevant while coming 2899 * from the Tx completion path in the reap case. 
2900 */ 2901 if (!mce->mc_fullreap) 2902 return; 2903 mutex_enter(&state->id_mc_mutex); 2904 IBD_MCACHE_PULLOUT_FULL(state, mce); 2905 mutex_exit(&state->id_mc_mutex); 2906 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2907 do_detach = B_FALSE; 2908 } else if (jstate == IB_MC_JSTATE_NON) { 2909 IBD_MCACHE_PULLOUT_NON(state, mce); 2910 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2911 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2912 do_detach = B_FALSE; 2913 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2914 mutex_enter(&state->id_mc_mutex); 2915 IBD_MCACHE_PULLOUT_FULL(state, mce); 2916 mutex_exit(&state->id_mc_mutex); 2917 do_detach = B_FALSE; 2918 } 2919 2920 /* 2921 * If we are reacting to a mcg trap and leaving our sendonly or 2922 * non membership, the mcg is possibly already gone, so attempting 2923 * to leave might fail. On the other hand, we must try to leave 2924 * anyway, since this might be a trap from long ago, and we could 2925 * have potentially sendonly joined to a recent incarnation of 2926 * the mcg and are about to loose track of this information. 2927 */ 2928 if (do_detach) { 2929 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 2930 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 2931 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 2932 } 2933 2934 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 2935 kmem_free(mce, sizeof (ibd_mce_t)); 2936 } 2937 2938 /* 2939 * Async code executed due to multicast and promiscuous disable requests 2940 * and mcg trap handling; also executed during driver detach. Mostly, a 2941 * leave and detach is done; except for the fullmember case when Tx 2942 * requests are pending, whence arrangements are made for subsequent 2943 * cleanup on Tx completion. 
2944 */ 2945 static void 2946 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2947 { 2948 ipoib_mac_t mcmac; 2949 boolean_t recycled; 2950 ibd_mce_t *mce; 2951 2952 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 2953 jstate, mgid.gid_prefix, mgid.gid_guid); 2954 2955 if (jstate == IB_MC_JSTATE_NON) { 2956 recycled = B_TRUE; 2957 mce = IBD_MCACHE_FIND_NON(state, mgid); 2958 /* 2959 * In case we are handling a mcg trap, we might not find 2960 * the mcg in the non list. 2961 */ 2962 if (mce == NULL) 2963 return; 2964 } else { 2965 mce = IBD_MCACHE_FIND_FULL(state, mgid); 2966 2967 /* 2968 * In case we are handling a mcg trap, make sure the trap 2969 * is not arriving late; if we have an mce that indicates 2970 * that we are already a fullmember, that would be a clear 2971 * indication that the trap arrived late (ie, is for a 2972 * previous incarnation of the mcg). 2973 */ 2974 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 2975 if ((mce == NULL) || (mce->mc_jstate == 2976 IB_MC_JSTATE_FULL)) 2977 return; 2978 ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2979 } else { 2980 ASSERT(jstate == IB_MC_JSTATE_FULL); 2981 2982 /* 2983 * If join group failed, mce will be NULL here. 2984 * This is because in GLDv3 driver, set multicast 2985 * will always return success. 2986 */ 2987 if (mce == NULL) 2988 return; 2989 ASSERT(mce->mc_jstate == IB_MC_JSTATE_FULL); 2990 mce->mc_fullreap = B_TRUE; 2991 } 2992 2993 /* 2994 * If no pending Tx's remain that reference the AH 2995 * for the mcg, recycle it from active to free list. 2996 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 2997 * so the last completing Tx will cause an async reap 2998 * operation to be invoked, at which time we will drop our 2999 * membership to the mcg so that the pending Tx's complete 3000 * successfully. Refer to comments on "AH and MCE active 3001 * list manipulation" at top of this file. 
The lock protects 3002 * against Tx fast path and Tx cleanup code. 3003 */ 3004 mutex_enter(&state->id_ac_mutex); 3005 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3006 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3007 IB_MC_JSTATE_SEND_ONLY_NON)); 3008 mutex_exit(&state->id_ac_mutex); 3009 } 3010 3011 if (recycled) { 3012 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3013 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3014 ibd_async_reap_group(state, mce, mgid, jstate); 3015 } 3016 } 3017 3018 /* 3019 * Find the broadcast address as defined by IPoIB; implicitly 3020 * determines the IBA scope, mtu, tclass etc of the link the 3021 * interface is going to be a member of. 3022 */ 3023 static ibt_status_t 3024 ibd_find_bgroup(ibd_state_t *state) 3025 { 3026 ibt_mcg_attr_t mcg_attr; 3027 uint_t numg; 3028 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3029 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3030 IB_MC_SCOPE_GLOBAL }; 3031 int i, mcgmtu; 3032 boolean_t found = B_FALSE; 3033 3034 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3035 mcg_attr.mc_pkey = state->id_pkey; 3036 state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK; 3037 3038 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3039 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3040 3041 /* 3042 * Look for the IPoIB broadcast group. 3043 */ 3044 state->id_mgid.gid_prefix = 3045 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3046 ((uint64_t)state->id_scope << 48) | 3047 ((uint32_t)(state->id_pkey << 16))); 3048 mcg_attr.mc_mgid = state->id_mgid; 3049 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3050 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3051 found = B_TRUE; 3052 break; 3053 } 3054 3055 } 3056 3057 if (!found) { 3058 ibd_print_warn(state, "IPoIB broadcast group absent"); 3059 return (IBT_FAILURE); 3060 } 3061 3062 /* 3063 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 
3064 */ 3065 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3066 if (state->id_mtu < mcgmtu) { 3067 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3068 "greater than port's maximum MTU %d", mcgmtu, 3069 state->id_mtu); 3070 return (IBT_FAILURE); 3071 } 3072 state->id_mtu = mcgmtu; 3073 3074 return (IBT_SUCCESS); 3075 } 3076 3077 /* 3078 * Post ibt_attach() initialization. 3079 */ 3080 static int 3081 ibd_drv_init(ibd_state_t *state) 3082 { 3083 kthread_t *kht; 3084 ibt_ud_chan_alloc_args_t ud_alloc_attr; 3085 ibt_ud_chan_query_attr_t ud_chan_attr; 3086 ibt_hca_portinfo_t *port_infop; 3087 ibt_hca_attr_t hca_attrs; 3088 ibt_status_t ibt_status; 3089 ibt_cq_attr_t cq_attr; 3090 ib_guid_t hca_guid; 3091 uint32_t real_size; 3092 uint32_t *ptr; 3093 char pathname[OBP_MAXPATHLEN]; 3094 uint_t psize, port_infosz; 3095 3096 /* 3097 * Initialize id_port before ibt_open_hca because of 3098 * ordering requirements in port up/down handling. 3099 */ 3100 if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) 3101 return (DDI_FAILURE); 3102 3103 if (ibt_open_hca(state->id_ibt_hdl, hca_guid, 3104 &state->id_hca_hdl) != IBT_SUCCESS) { 3105 DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); 3106 return (DDI_FAILURE); 3107 } 3108 3109 mutex_enter(&state->id_link_mutex); 3110 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 3111 state->id_port, &port_infop, &psize, 3112 &port_infosz); 3113 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 3114 mutex_exit(&state->id_link_mutex); 3115 DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); 3116 (void) ibt_close_hca(state->id_hca_hdl); 3117 return (DDI_FAILURE); 3118 } 3119 3120 /* 3121 * If the link already went down by the time we get here, give up; 3122 * we can not even get the gid since that is not valid. We would 3123 * fail in ibd_find_bgroup() anyway. 
3124 */ 3125 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 3126 mutex_exit(&state->id_link_mutex); 3127 ibt_free_portinfo(port_infop, port_infosz); 3128 (void) ibt_close_hca(state->id_hca_hdl); 3129 ibd_print_warn(state, "Port is not active"); 3130 return (DDI_FAILURE); 3131 } 3132 3133 /* 3134 * This verifies the Pkey ibnexus handed us is still valid. 3135 * This is also the point from which the pkey table for the 3136 * port must hold the exact pkey value at the exact index 3137 * across port up/downs. 3138 */ 3139 if (ibt_pkey2index(state->id_hca_hdl, state->id_port, 3140 state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { 3141 mutex_exit(&state->id_link_mutex); 3142 ibt_free_portinfo(port_infop, port_infosz); 3143 DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); 3144 (void) ibt_close_hca(state->id_hca_hdl); 3145 return (DDI_FAILURE); 3146 } 3147 3148 state->id_mtu = (128 << port_infop->p_mtu); 3149 state->id_sgid = *port_infop->p_sgid_tbl; 3150 state->id_link_state = LINK_STATE_UP; 3151 mutex_exit(&state->id_link_mutex); 3152 3153 ibt_free_portinfo(port_infop, port_infosz); 3154 3155 state->id_link_speed = ibd_get_portspeed(state); 3156 3157 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 3158 ASSERT(ibt_status == IBT_SUCCESS); 3159 3160 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 3161 DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); 3162 goto drv_init_fail_find_bgroup; 3163 } 3164 3165 if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 3166 &state->id_pd_hdl) != IBT_SUCCESS) { 3167 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); 3168 goto drv_init_fail_alloc_pd; 3169 } 3170 3171 /* Initialize the parallel ARP cache and AHs */ 3172 if (ibd_acache_init(state) != DDI_SUCCESS) { 3173 DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); 3174 goto drv_init_fail_acache; 3175 } 3176 3177 /* 3178 * Check various tunable limits. 
3179 */ 3180 if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) { 3181 ibd_print_warn(state, "Setting #sgl = %d instead of default %d", 3182 hca_attrs.hca_max_sgl, IBD_MAX_SQSEG); 3183 state->id_max_sqseg = hca_attrs.hca_max_sgl; 3184 } else { 3185 state->id_max_sqseg = IBD_MAX_SQSEG; 3186 } 3187 3188 /* 3189 * First, check #r/s wqes against max channel size. 3190 */ 3191 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) 3192 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 3193 else 3194 state->id_num_rwqe = IBD_NUM_RWQE; 3195 3196 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) 3197 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 3198 else 3199 state->id_num_swqe = IBD_NUM_SWQE; 3200 3201 /* 3202 * Allocate Rx/combined CQ: 3203 * Theoretically, there is no point in having more than #rwqe 3204 * plus #swqe cqe's, except that the CQ will be signalled for 3205 * overflow when the last wqe completes, if none of the previous 3206 * cqe's have been polled. Thus, we allocate just a few less wqe's 3207 * to make sure such overflow does not occur. 3208 */ 3209 cq_attr.cq_sched = NULL; 3210 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 3211 3212 if (ibd_separate_cqs == 1) { 3213 /* 3214 * Allocate Receive CQ. 
3215 */ 3216 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 3217 cq_attr.cq_size = state->id_num_rwqe + 1; 3218 } else { 3219 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3220 state->id_num_rwqe = cq_attr.cq_size - 1; 3221 } 3222 3223 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3224 ibd_print_warn(state, "Computed #rwqe %d based on " 3225 "requested size and supportable CQ size is less " 3226 "than the required threshold %d", 3227 state->id_num_rwqe, IBD_RX_THRESHOLD); 3228 goto drv_init_fail_min_rwqes; 3229 } 3230 3231 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3232 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3233 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3234 goto drv_init_fail_alloc_rcq; 3235 } 3236 state->id_rxwcs_size = state->id_num_rwqe + 1; 3237 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 3238 state->id_rxwcs_size, KM_SLEEP); 3239 3240 3241 /* 3242 * Allocate Send CQ. 3243 */ 3244 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 3245 cq_attr.cq_size = state->id_num_swqe + 1; 3246 } else { 3247 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3248 state->id_num_swqe = cq_attr.cq_size - 1; 3249 } 3250 3251 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3252 &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { 3253 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3254 goto drv_init_fail_alloc_scq; 3255 } 3256 state->id_txwcs_size = state->id_num_swqe + 1; 3257 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 3258 state->id_txwcs_size, KM_SLEEP); 3259 } else { 3260 /* 3261 * Allocate combined Send/Receive CQ. 
3262 */ 3263 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 3264 state->id_num_swqe + 1)) { 3265 cq_attr.cq_size = state->id_num_rwqe + 3266 state->id_num_swqe + 1; 3267 } else { 3268 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3269 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 3270 state->id_num_rwqe) / (state->id_num_rwqe + 3271 state->id_num_swqe); 3272 state->id_num_swqe = cq_attr.cq_size - 1 - 3273 state->id_num_rwqe; 3274 } 3275 3276 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3277 ibd_print_warn(state, "Computed #rwqe %d based on " 3278 "requested size and supportable CQ size is less " 3279 "than the required threshold %d", 3280 state->id_num_rwqe, IBD_RX_THRESHOLD); 3281 goto drv_init_fail_min_rwqes; 3282 } 3283 3284 state->id_rxwcs_size = cq_attr.cq_size; 3285 state->id_txwcs_size = state->id_rxwcs_size; 3286 3287 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3288 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3289 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3290 goto drv_init_fail_alloc_rcq; 3291 } 3292 state->id_scq_hdl = state->id_rcq_hdl; 3293 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 3294 state->id_rxwcs_size, KM_SLEEP); 3295 state->id_txwcs = state->id_rxwcs; 3296 } 3297 3298 /* 3299 * Print message in case we could not allocate as many wqe's 3300 * as was requested. Note that in the combined CQ case, we will 3301 * get the following message. 
3302 */ 3303 if (state->id_num_rwqe != IBD_NUM_RWQE) 3304 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 3305 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 3306 if (state->id_num_swqe != IBD_NUM_SWQE) 3307 ibd_print_warn(state, "Setting #swqe = %d instead of default " 3308 "%d", state->id_num_swqe, IBD_NUM_SWQE); 3309 3310 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 3311 ud_alloc_attr.ud_hca_port_num = state->id_port; 3312 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 3313 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 3314 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 3315 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 3316 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 3317 ud_alloc_attr.ud_scq = state->id_scq_hdl; 3318 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 3319 ud_alloc_attr.ud_pd = state->id_pd_hdl; 3320 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 3321 ud_alloc_attr.ud_clone_chan = NULL; 3322 if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 3323 &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { 3324 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" 3325 "\n"); 3326 goto drv_init_fail_alloc_chan; 3327 } 3328 3329 if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != 3330 DDI_SUCCESS) { 3331 DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); 3332 goto drv_init_fail_query_chan; 3333 } 3334 state->id_qpnum = ud_chan_attr.ud_qpn; 3335 3336 /* Initialize the Transmit buffer list */ 3337 if (ibd_init_txlist(state) != DDI_SUCCESS) { 3338 DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); 3339 goto drv_init_fail_txlist_init; 3340 } 3341 3342 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 3343 /* Setup the handler we will use for regular DLPI stuff */ 3344 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 3345 if (ibt_enable_cq_notify(state->id_scq_hdl, 3346 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 3347 DPRINT(10, "ibd_drv_init : failed in" 3348 " 
ibt_enable_cq_notify()\n"); 3349 goto drv_init_fail_cq_notify; 3350 } 3351 } 3352 3353 /* Create the service fifos before we start receiving */ 3354 if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos, 3355 state)) == NULL) { 3356 DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n"); 3357 goto drv_init_fail_srv_fifo; 3358 } 3359 3360 /* Initialize the Receive buffer list */ 3361 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 3362 DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); 3363 goto drv_init_fail_rxlist_init; 3364 } 3365 3366 /* Join to IPoIB broadcast group as required by IPoIB */ 3367 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 3368 DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); 3369 goto drv_init_fail_join_group; 3370 } 3371 3372 /* Create the async thread */ 3373 if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 3374 TS_RUN, minclsyspri)) == NULL) { 3375 /* Do we have to specially leave the group? */ 3376 DPRINT(10, "ibd_drv_init : failed in thread_create\n"); 3377 goto drv_init_fail_thread_create; 3378 } 3379 state->id_async_thrid = kht->t_did; 3380 3381 /* 3382 * The local mac address is now known. Create the IPoIB 3383 * address. 3384 */ 3385 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 3386 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3387 /* 3388 * Similarly, program in the broadcast mac address. 
3389 */ 3390 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, 3391 state->id_mgid.gid_guid); 3392 3393 ptr = (uint32_t *)&state->id_macaddr; 3394 DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", 3395 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3396 ptr = (uint32_t *)&state->id_bcaddr; 3397 DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", 3398 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3399 DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", 3400 state->id_pkey, state->id_mgid.gid_prefix, 3401 state->id_mgid.gid_guid); 3402 DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", 3403 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3404 DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); 3405 DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); 3406 (void) ddi_pathname(state->id_dip, pathname); 3407 DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); 3408 3409 return (DDI_SUCCESS); 3410 3411 drv_init_fail_thread_create: 3412 ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL); 3413 3414 drv_init_fail_join_group: 3415 ibd_fini_rxlist(state); 3416 3417 drv_init_fail_rxlist_init: 3418 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3419 3420 drv_init_fail_srv_fifo: 3421 drv_init_fail_cq_notify: 3422 ibd_fini_txlist(state); 3423 3424 drv_init_fail_txlist_init: 3425 drv_init_fail_query_chan: 3426 if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) 3427 DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); 3428 3429 drv_init_fail_alloc_chan: 3430 if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != 3431 IBT_SUCCESS)) 3432 DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); 3433 3434 if (ibd_separate_cqs == 1) 3435 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * 3436 state->id_txwcs_size); 3437 3438 drv_init_fail_alloc_scq: 3439 if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) 3440 DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); 
3441 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); 3442 3443 drv_init_fail_min_rwqes: 3444 drv_init_fail_alloc_rcq: 3445 ibd_acache_fini(state); 3446 drv_init_fail_acache: 3447 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3448 DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); 3449 3450 drv_init_fail_alloc_pd: 3451 ibt_free_mcg_info(state->id_mcinfo, 1); 3452 drv_init_fail_find_bgroup: 3453 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3454 DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); 3455 3456 return (DDI_FAILURE); 3457 } 3458 3459 /* 3460 * Allocate the statically allocated Tx buffer list. 3461 */ 3462 static int 3463 ibd_init_txlist(ibd_state_t *state) 3464 { 3465 ibd_swqe_t *swqe; 3466 int i; 3467 3468 for (i = 0; i < state->id_num_swqe; i++) { 3469 if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) { 3470 DPRINT(10, "ibd_init_txlist : failed in " 3471 "ibd_alloc_swqe()\n"); 3472 ibd_fini_txlist(state); 3473 return (DDI_FAILURE); 3474 } 3475 3476 /* add to list */ 3477 state->id_tx_list.dl_cnt++; 3478 if (state->id_tx_list.dl_head == NULL) { 3479 swqe->swqe_prev = NULL; 3480 swqe->swqe_next = NULL; 3481 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3482 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3483 } else { 3484 swqe->swqe_prev = state->id_tx_list.dl_tail; 3485 swqe->swqe_next = NULL; 3486 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3487 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3488 } 3489 } 3490 3491 return (DDI_SUCCESS); 3492 } 3493 3494 /* 3495 * Free the statically allocated Tx buffer list. 
3496 */ 3497 static void 3498 ibd_fini_txlist(ibd_state_t *state) 3499 { 3500 ibd_swqe_t *node; 3501 3502 mutex_enter(&state->id_tx_list.dl_mutex); 3503 while (state->id_tx_list.dl_head != NULL) { 3504 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3505 state->id_tx_list.dl_head = node->swqe_next; 3506 state->id_tx_list.dl_cnt--; 3507 ASSERT(state->id_tx_list.dl_cnt >= 0); 3508 ibd_free_swqe(state, node); 3509 } 3510 mutex_exit(&state->id_tx_list.dl_mutex); 3511 } 3512 3513 /* 3514 * Allocate a single send wqe and register it so it is almost 3515 * ready to be posted to the hardware. 3516 */ 3517 static int 3518 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe) 3519 { 3520 ibt_mr_attr_t mem_attr; 3521 ibd_swqe_t *swqe; 3522 3523 swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP); 3524 *wqe = swqe; 3525 swqe->swqe_type = IBD_WQE_SEND; 3526 swqe->swqe_next = NULL; 3527 swqe->swqe_prev = NULL; 3528 swqe->swqe_im_mblk = NULL; 3529 3530 /* alloc copy buffer, must be max size to handle multiple mblk case */ 3531 swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP); 3532 3533 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3534 mem_attr.mr_len = state->id_mtu; 3535 mem_attr.mr_as = NULL; 3536 mem_attr.mr_flags = IBT_MR_SLEEP; 3537 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3538 &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) != 3539 IBT_SUCCESS) { 3540 DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()"); 3541 kmem_free(swqe->swqe_copybuf.ic_bufaddr, 3542 state->id_mtu); 3543 kmem_free(swqe, sizeof (ibd_swqe_t)); 3544 return (DDI_FAILURE); 3545 } 3546 3547 swqe->swqe_copybuf.ic_sgl.ds_va = 3548 (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3549 swqe->swqe_copybuf.ic_sgl.ds_key = 3550 swqe->swqe_copybuf.ic_mr_desc.md_lkey; 3551 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3552 3553 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3554 swqe->w_swr.wr_flags = 
IBT_WR_SEND_SIGNAL; 3555 swqe->w_swr.wr_trans = IBT_UD_SRV; 3556 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3557 3558 /* These are set in send */ 3559 swqe->w_swr.wr_nds = 0; 3560 swqe->w_swr.wr_sgl = NULL; 3561 3562 return (DDI_SUCCESS); 3563 } 3564 3565 /* 3566 * Free an allocated send wqe. 3567 */ 3568 static void 3569 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3570 { 3571 3572 if (ibt_deregister_mr(state->id_hca_hdl, 3573 swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3574 DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()"); 3575 return; 3576 } 3577 kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu); 3578 kmem_free(swqe, sizeof (ibd_swqe_t)); 3579 } 3580 3581 /* 3582 * Post a rwqe to the hardware and add it to the Rx list. The 3583 * "recycle" parameter indicates whether an old rwqe is being 3584 * recycled, or this is a new one. 3585 */ 3586 static int 3587 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3588 { 3589 /* 3590 * Here we should add dl_cnt before post recv, because we would 3591 * have to make sure dl_cnt has already updated before 3592 * corresponding ibd_process_rx() is called. 3593 */ 3594 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3595 if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) != 3596 IBT_SUCCESS) { 3597 (void) atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 3598 DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()"); 3599 return (DDI_FAILURE); 3600 } 3601 3602 /* 3603 * Buffers being recycled are already in the list. 
3604 */ 3605 if (recycle) 3606 return (DDI_SUCCESS); 3607 3608 mutex_enter(&state->id_rx_list.dl_mutex); 3609 if (state->id_rx_list.dl_head == NULL) { 3610 rwqe->rwqe_prev = NULL; 3611 rwqe->rwqe_next = NULL; 3612 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3613 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3614 } else { 3615 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3616 rwqe->rwqe_next = NULL; 3617 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3618 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3619 } 3620 mutex_exit(&state->id_rx_list.dl_mutex); 3621 3622 return (DDI_SUCCESS); 3623 } 3624 3625 /* 3626 * Allocate the statically allocated Rx buffer list. 3627 */ 3628 static int 3629 ibd_init_rxlist(ibd_state_t *state) 3630 { 3631 ibd_rwqe_t *rwqe; 3632 int i; 3633 3634 for (i = 0; i < state->id_num_rwqe; i++) { 3635 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3636 ibd_fini_rxlist(state); 3637 return (DDI_FAILURE); 3638 } 3639 3640 if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { 3641 ibd_free_rwqe(state, rwqe); 3642 ibd_fini_rxlist(state); 3643 return (DDI_FAILURE); 3644 } 3645 } 3646 3647 return (DDI_SUCCESS); 3648 } 3649 3650 /* 3651 * Free the statically allocated Rx buffer list. 3652 * 3653 */ 3654 static void 3655 ibd_fini_rxlist(ibd_state_t *state) 3656 { 3657 ibd_rwqe_t *node; 3658 3659 mutex_enter(&state->id_rx_list.dl_mutex); 3660 while (state->id_rx_list.dl_head != NULL) { 3661 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3662 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3663 state->id_rx_list.dl_cnt--; 3664 ASSERT(state->id_rx_list.dl_cnt >= 0); 3665 3666 ibd_free_rwqe(state, node); 3667 } 3668 mutex_exit(&state->id_rx_list.dl_mutex); 3669 } 3670 3671 /* 3672 * Allocate a single recv wqe and register it so it is almost 3673 * ready to be posted to the hardware. 
3674 */ 3675 static int 3676 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3677 { 3678 ibt_mr_attr_t mem_attr; 3679 ibd_rwqe_t *rwqe; 3680 3681 if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3682 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3683 return (DDI_FAILURE); 3684 } 3685 *wqe = rwqe; 3686 rwqe->rwqe_type = IBD_WQE_RECV; 3687 rwqe->w_state = state; 3688 rwqe->rwqe_next = NULL; 3689 rwqe->rwqe_prev = NULL; 3690 rwqe->w_freeing_wqe = B_FALSE; 3691 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3692 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3693 3694 if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3695 IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) { 3696 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2"); 3697 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3698 return (DDI_FAILURE); 3699 } 3700 3701 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3702 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3703 NULL) { 3704 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3705 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3706 state->id_mtu + IPOIB_GRH_SIZE); 3707 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3708 return (DDI_FAILURE); 3709 } 3710 3711 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3712 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3713 mem_attr.mr_as = NULL; 3714 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3715 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3716 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3717 IBT_SUCCESS) { 3718 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3719 rwqe->w_freeing_wqe = B_TRUE; 3720 freemsg(rwqe->rwqe_im_mblk); 3721 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3722 state->id_mtu + IPOIB_GRH_SIZE); 3723 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3724 return (DDI_FAILURE); 3725 } 3726 3727 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3728 
(ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3729 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3730 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3731 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3732 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3733 rwqe->w_rwr.wr_nds = 1; 3734 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3735 3736 return (DDI_SUCCESS); 3737 } 3738 3739 /* 3740 * Free an allocated recv wqe. 3741 */ 3742 static void 3743 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3744 { 3745 3746 if (ibt_deregister_mr(state->id_hca_hdl, 3747 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3748 DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()"); 3749 return; 3750 } 3751 3752 /* 3753 * Indicate to the callback function that this rwqe/mblk 3754 * should not be recycled. The freemsg() will invoke 3755 * ibd_freemsg_cb(). 3756 */ 3757 if (rwqe->rwqe_im_mblk != NULL) { 3758 rwqe->w_freeing_wqe = B_TRUE; 3759 freemsg(rwqe->rwqe_im_mblk); 3760 } 3761 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3762 state->id_mtu + IPOIB_GRH_SIZE); 3763 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3764 } 3765 3766 /* 3767 * Delete the rwqe being freed from the rx list. 3768 */ 3769 static void 3770 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3771 { 3772 mutex_enter(&state->id_rx_list.dl_mutex); 3773 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3774 state->id_rx_list.dl_head = rwqe->rwqe_next; 3775 else 3776 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3777 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3778 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3779 else 3780 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3781 mutex_exit(&state->id_rx_list.dl_mutex); 3782 } 3783 3784 /* 3785 * Pre ibt_detach() deconstruction. 
3786 */ 3787 static void 3788 ibd_drv_fini(ibd_state_t *state) 3789 { 3790 ib_gid_t mgid; 3791 ibd_mce_t *mce; 3792 ibt_status_t status; 3793 uint8_t jstate; 3794 3795 /* 3796 * Desubscribe from trap notices; we will be tearing down 3797 * the mcg lists soon. Make sure the trap handler does nothing 3798 * even if it is invoked (ie till we invoke ibt_detach()). 3799 */ 3800 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 3801 mutex_enter(&state->id_trap_lock); 3802 state->id_trap_stop = B_TRUE; 3803 while (state->id_trap_inprog > 0) 3804 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 3805 mutex_exit(&state->id_trap_lock); 3806 3807 /* 3808 * Flushing the channel ensures that all pending WQE's 3809 * are marked with flush_error and handed to the CQ. It 3810 * does not guarantee the invocation of the CQ handler. 3811 * This call is guaranteed to return successfully for UD QPNs. 3812 */ 3813 status = ibt_flush_channel(state->id_chnl_hdl); 3814 ASSERT(status == IBT_SUCCESS); 3815 3816 /* 3817 * We possibly need a loop here to wait for all the Tx 3818 * callbacks to happen. The Tx handlers will retrieve 3819 * held resources like AH ac_ref count, registered memory 3820 * and possibly ASYNC_REAP requests. Rx interrupts were already 3821 * turned off (in ibd_detach()); turn off Tx interrupts and 3822 * poll. By the time the polling returns an empty indicator, 3823 * we are sure we have seen all pending Tx callbacks. Note 3824 * that after the ibt_set_cq_handler() returns, the old handler 3825 * is guaranteed not to be invoked anymore. 3826 */ 3827 if (ibd_separate_cqs == 1) 3828 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 3829 ibd_poll_compq(state, state->id_scq_hdl); 3830 3831 /* 3832 * No more async requests will be posted since the device has been 3833 * unregistered; completion handlers have been turned off, so Tx 3834 * handler will not cause any more ASYNC_REAP requests. 
Queue a 3835 * request for the async thread to exit, which will be serviced 3836 * after any pending ones. This can take a while, specially if the 3837 * SM is unreachable, since IBMF will slowly timeout each SM request 3838 * issued by the async thread. Reap the thread before continuing on, 3839 * we do not want it to be lingering in modunloaded code. 3840 */ 3841 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT); 3842 thread_join(state->id_async_thrid); 3843 3844 /* 3845 * We can not be in promiscuous mode anymore, upper layers 3846 * would have made a request to disable it (if ever set previously) 3847 * before the detach is allowed to progress to this point; and the 3848 * aysnc thread would have processed that request by now. Thus the 3849 * nonmember list is guaranteed empty at this point. 3850 */ 3851 ASSERT(state->id_prom_op != COMPLETED); 3852 3853 /* 3854 * Drop all residual full/non membership. This includes full 3855 * membership to the broadcast group, and any nonmembership 3856 * acquired during transmits. We do this after the Tx completion 3857 * handlers are done, since those might result in some late 3858 * leaves; this also eliminates a potential race with that 3859 * path wrt the mc full list insert/delete. Trap handling 3860 * has also been suppressed at this point. Thus, no locks 3861 * are required while traversing the mc full list. 3862 */ 3863 DPRINT(2, "ibd_drv_fini : clear full cache entries"); 3864 mce = list_head(&state->id_mc_full); 3865 while (mce != NULL) { 3866 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3867 jstate = mce->mc_jstate; 3868 mce = list_next(&state->id_mc_full, mce); 3869 ibd_leave_group(state, mgid, jstate); 3870 } 3871 3872 ibt_free_mcg_info(state->id_mcinfo, 1); 3873 3874 /* 3875 * Kill the channel now; guaranteed to return successfully 3876 * for UD QPNs. 
3877 */ 3878 status = ibt_free_channel(state->id_chnl_hdl); 3879 ASSERT(status == IBT_SUCCESS); 3880 3881 /* 3882 * Kill the CQ; all completion handlers are guaranteed to 3883 * have terminated by the time this returns. Since we killed 3884 * the QPN above, we can not receive the IBT_CQ_BUSY error. 3885 */ 3886 status = ibt_free_cq(state->id_rcq_hdl); 3887 ASSERT(status == IBT_SUCCESS); 3888 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); 3889 3890 if (ibd_separate_cqs == 1) { 3891 status = ibt_free_cq(state->id_scq_hdl); 3892 ASSERT(status == IBT_SUCCESS); 3893 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * 3894 state->id_txwcs_size); 3895 } 3896 3897 /* 3898 * We killed the receive interrupts, thus, we will not be 3899 * required to handle received packets anymore. Thus, kill 3900 * service threads since they are not going to be used anymore. 3901 */ 3902 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3903 3904 /* 3905 * Since these following will act on the Rx/Tx list, which 3906 * is also looked at by the Rx/Tx handlers, keep them around 3907 * till all handlers are guaranteed to have completed. 3908 */ 3909 ibd_fini_rxlist(state); 3910 ibd_fini_txlist(state); 3911 3912 /* 3913 * Clean up the active AH hash list. 3914 */ 3915 mod_hash_destroy_hash(state->id_ah_active_hash); 3916 3917 /* 3918 * Free parallel ARP cache and AHs; we are sure all of these 3919 * resources have been released by the Tx completion handler. 3920 */ 3921 ibd_acache_fini(state); 3922 3923 /* 3924 * We freed the QPN, all the MRs and AHs. This step should not 3925 * fail; print a warning message if it does fail, due to a bug 3926 * in the driver. 
3927 */ 3928 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3929 ibd_print_warn(state, "failed to free protection domain"); 3930 3931 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3932 ibd_print_warn(state, "failed to close HCA device"); 3933 } 3934 3935 /* 3936 * IBA Rx/Tx completion queue handler. Guaranteed to be single 3937 * threaded and nonreentrant for this CQ. When using combined CQ, 3938 * this handles Tx and Rx completions. With separate CQs, this handles 3939 * only Rx completions. 3940 */ 3941 /* ARGSUSED */ 3942 static void 3943 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3944 { 3945 ibd_state_t *state = (ibd_state_t *)arg; 3946 3947 atomic_add_64(&state->id_num_intrs, 1); 3948 3949 if (ibd_rx_softintr == 1) 3950 ddi_trigger_softintr(state->id_rx); 3951 else 3952 (void) ibd_intr((char *)state); 3953 } 3954 3955 /* 3956 * Separate CQ handler for Tx completions, when the Tx CQ is in 3957 * interrupt driven mode. 3958 */ 3959 /* ARGSUSED */ 3960 static void 3961 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3962 { 3963 ibd_state_t *state = (ibd_state_t *)arg; 3964 3965 atomic_add_64(&state->id_num_intrs, 1); 3966 3967 if (ibd_tx_softintr == 1) 3968 ddi_trigger_softintr(state->id_tx); 3969 else 3970 (void) ibd_tx_recycle((char *)state); 3971 } 3972 3973 /* 3974 * Multicast group create/delete trap handler. These will be delivered 3975 * on a kernel thread (handling can thus block) and can be invoked 3976 * concurrently. The handler can be invoked anytime after it is 3977 * registered and before ibt_detach(). 3978 */ 3979 /* ARGSUSED */ 3980 static void 3981 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 3982 ibt_subnet_event_t *event) 3983 { 3984 ibd_state_t *state = (ibd_state_t *)arg; 3985 ibd_req_t *req; 3986 3987 /* 3988 * The trap handler will get invoked once for every event for 3989 * evert port. 
The input "gid" is the GID0 of the port the 3990 * trap came in on; we just need to act on traps that came 3991 * to our port, meaning the port on which the ipoib interface 3992 * resides. Since ipoib uses GID0 of the port, we just match 3993 * the gids to check whether we need to handle the trap. 3994 */ 3995 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 3996 return; 3997 3998 DPRINT(10, "ibd_notices_handler : %d\n", code); 3999 4000 switch (code) { 4001 case IBT_SM_EVENT_UNAVAILABLE: 4002 /* 4003 * If we are in promiscuous mode or have 4004 * sendnonmembers, we need to print a warning 4005 * message right now. Else, just store the 4006 * information, print when we enter promiscuous 4007 * mode or attempt nonmember send. We might 4008 * also want to stop caching sendnonmember. 4009 */ 4010 ibd_print_warn(state, "IBA multicast support " 4011 "degraded due to unavailability of multicast " 4012 "traps"); 4013 break; 4014 case IBT_SM_EVENT_AVAILABLE: 4015 /* 4016 * If we printed a warning message above or 4017 * while trying to nonmember send or get into 4018 * promiscuous mode, print an okay message. 4019 */ 4020 ibd_print_warn(state, "IBA multicast support " 4021 "restored due to availability of multicast " 4022 "traps"); 4023 break; 4024 case IBT_SM_EVENT_MCG_CREATED: 4025 case IBT_SM_EVENT_MCG_DELETED: 4026 /* 4027 * Common processing of creation/deletion traps. 4028 * First check if the instance is being 4029 * [de]initialized; back off then, without doing 4030 * anything more, since we are not sure if the 4031 * async thread is around, or whether we might 4032 * be racing with the detach code in ibd_drv_fini() 4033 * that scans the mcg list. 
4034 */ 4035 if (!ibd_async_safe(state)) 4036 return; 4037 4038 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4039 req->rq_gid = event->sm_notice_gid; 4040 req->rq_ptr = (void *)code; 4041 ibd_queue_work_slot(state, req, ASYNC_TRAP); 4042 break; 4043 } 4044 } 4045 4046 static void 4047 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4048 { 4049 ib_gid_t mgid = req->rq_gid; 4050 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4051 4052 DPRINT(10, "ibd_async_trap : %d\n", code); 4053 4054 /* 4055 * Atomically search the nonmember and sendonlymember lists and 4056 * delete. 4057 */ 4058 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4059 4060 if (state->id_prom_op == COMPLETED) { 4061 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4062 4063 /* 4064 * If in promiscuous mode, try to join/attach to the new 4065 * mcg. Given the unreliable out-of-order mode of trap 4066 * delivery, we can never be sure whether it is a problem 4067 * if the join fails. Thus, we warn the admin of a failure 4068 * if this was a creation trap. Note that the trap might 4069 * actually be reporting a long past event, and the mcg 4070 * might already have been deleted, thus we might be warning 4071 * in vain. 4072 */ 4073 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4074 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4075 ibd_print_warn(state, "IBA promiscuous mode missed " 4076 "new multicast gid %016llx:%016llx", 4077 (u_longlong_t)mgid.gid_prefix, 4078 (u_longlong_t)mgid.gid_guid); 4079 } 4080 4081 /* 4082 * Free the request slot allocated by the subnet event thread. 4083 */ 4084 ibd_async_done(state); 4085 } 4086 4087 /* 4088 * GLDv3 entry point to get capabilities. 
4089 */ 4090 static boolean_t 4091 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4092 { 4093 _NOTE(ARGUNUSED(arg)); 4094 4095 switch (cap) { 4096 case MAC_CAPAB_HCKSUM: { 4097 uint32_t *txflags = cap_data; 4098 4099 if (ibd_csum_send > IBD_CSUM_NONE) 4100 *txflags = HCKSUM_INET_PARTIAL; 4101 else 4102 return (B_FALSE); 4103 break; 4104 } 4105 default: 4106 return (B_FALSE); 4107 } 4108 return (B_TRUE); 4109 } 4110 4111 /* 4112 * GLDv3 entry point to start hardware. 4113 */ 4114 /* ARGSUSED */ 4115 static int 4116 ibd_m_start(void *arg) 4117 { 4118 return (0); 4119 } 4120 4121 /* 4122 * GLDv3 entry point to stop hardware from receiving packets. 4123 */ 4124 /* ARGSUSED */ 4125 static void 4126 ibd_m_stop(void *arg) 4127 { 4128 #ifdef RUN_PERFORMANCE 4129 ibd_perf((ibd_state_t *)arg); 4130 #endif 4131 } 4132 4133 /* 4134 * GLDv3 entry point to modify device's mac address. We do not 4135 * allow address modifications. 4136 */ 4137 static int 4138 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4139 { 4140 ibd_state_t *state; 4141 4142 state = (ibd_state_t *)arg; 4143 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4144 return (0); 4145 else 4146 return (EINVAL); 4147 } 4148 4149 /* 4150 * The blocking part of the IBA join/leave operations are done out 4151 * of here on the async thread. 4152 */ 4153 static void 4154 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4155 { 4156 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4157 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4158 4159 if (op == ASYNC_JOIN) { 4160 4161 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 4162 ibd_print_warn(state, "Joint multicast group failed :" 4163 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4164 } 4165 } else { 4166 /* 4167 * Here, we must search for the proper mcg_info and 4168 * use that to leave the group. 
4169 */ 4170 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4171 } 4172 } 4173 4174 /* 4175 * GLDv3 entry point for multicast enable/disable requests. 4176 * This function queues the operation to the async thread and 4177 * return success for a valid multicast address. 4178 */ 4179 static int 4180 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 4181 { 4182 ibd_state_t *state = (ibd_state_t *)arg; 4183 ipoib_mac_t maddr, *mcast; 4184 ib_gid_t mgid; 4185 ibd_req_t *req; 4186 4187 /* 4188 * The incoming multicast address might not be aligned properly 4189 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4190 * it to look like one though, to get the offsets of the mc gid, 4191 * since we know we are not going to dereference any values with 4192 * the ipoib_mac_t pointer. 4193 */ 4194 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 4195 mcast = &maddr; 4196 4197 /* 4198 * Check validity of MCG address. We could additionally check 4199 * that a enable/disable is not being issued on the "broadcast" 4200 * mcg, but since this operation is only invokable by priviledged 4201 * programs anyway, we allow the flexibility to those dlpi apps. 4202 * Note that we do not validate the "scope" of the IBA mcg. 4203 */ 4204 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 4205 return (EINVAL); 4206 4207 /* 4208 * fill in multicast pkey and scope 4209 */ 4210 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 4211 4212 /* 4213 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4214 * nothing (ie we stay JOINed to the broadcast group done in 4215 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically 4216 * requires to be joined to broadcast groups at all times. 4217 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4218 * depends on this. 
4219 */ 4220 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4221 return (0); 4222 4223 ibd_n2h_gid(mcast, &mgid); 4224 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4225 if (req == NULL) 4226 return (ENOMEM); 4227 4228 req->rq_gid = mgid; 4229 4230 if (add) { 4231 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4232 mgid.gid_prefix, mgid.gid_guid); 4233 ibd_queue_work_slot(state, req, ASYNC_JOIN); 4234 } else { 4235 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4236 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4237 ibd_queue_work_slot(state, req, ASYNC_LEAVE); 4238 } 4239 return (0); 4240 } 4241 4242 /* 4243 * The blocking part of the IBA promiscuous operations are done 4244 * out of here on the async thread. The dlpireq parameter indicates 4245 * whether this invocation is due to a dlpi request or due to 4246 * a port up/down event. 4247 */ 4248 static void 4249 ibd_async_unsetprom(ibd_state_t *state) 4250 { 4251 ibd_mce_t *mce = list_head(&state->id_mc_non); 4252 ib_gid_t mgid; 4253 4254 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4255 4256 while (mce != NULL) { 4257 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4258 mce = list_next(&state->id_mc_non, mce); 4259 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4260 } 4261 state->id_prom_op = NOTSTARTED; 4262 } 4263 4264 /* 4265 * The blocking part of the IBA promiscuous operations are done 4266 * out of here on the async thread. The dlpireq parameter indicates 4267 * whether this invocation is due to a dlpi request or due to 4268 * a port up/down event. 4269 */ 4270 static void 4271 ibd_async_setprom(ibd_state_t *state) 4272 { 4273 ibt_mcg_attr_t mcg_attr; 4274 ibt_mcg_info_t *mcg_info; 4275 ib_gid_t mgid; 4276 uint_t numg; 4277 int i, ret = COMPLETED; 4278 4279 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4280 4281 /* 4282 * Obtain all active MC groups on the IB fabric with 4283 * specified criteria (scope + Pkey + Qkey + mtu). 
4284 */ 4285 bzero(&mcg_attr, sizeof (mcg_attr)); 4286 mcg_attr.mc_pkey = state->id_pkey; 4287 mcg_attr.mc_scope = state->id_scope; 4288 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4289 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4290 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4291 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4292 IBT_SUCCESS) { 4293 ibd_print_warn(state, "Could not get list of IBA multicast " 4294 "groups"); 4295 ret = ERRORED; 4296 goto done; 4297 } 4298 4299 /* 4300 * Iterate over the returned mcg's and join as NonMember 4301 * to the IP mcg's. 4302 */ 4303 for (i = 0; i < numg; i++) { 4304 /* 4305 * Do a NonMember JOIN on the MC group. 4306 */ 4307 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4308 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4309 ibd_print_warn(state, "IBA promiscuous mode missed " 4310 "multicast gid %016llx:%016llx", 4311 (u_longlong_t)mgid.gid_prefix, 4312 (u_longlong_t)mgid.gid_guid); 4313 } 4314 4315 ibt_free_mcg_info(mcg_info, numg); 4316 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4317 done: 4318 state->id_prom_op = ret; 4319 } 4320 4321 /* 4322 * GLDv3 entry point for multicast promiscuous enable/disable requests. 4323 * GLDv3 assumes phys state receives more packets than multi state, 4324 * which is not true for IPoIB. Thus, treat the multi and phys 4325 * promiscuous states the same way to work with GLDv3's assumption. 
4326 */ 4327 static int 4328 ibd_m_promisc(void *arg, boolean_t on) 4329 { 4330 ibd_state_t *state = (ibd_state_t *)arg; 4331 ibd_req_t *req; 4332 4333 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4334 if (req == NULL) 4335 return (ENOMEM); 4336 if (on) { 4337 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4338 ibd_queue_work_slot(state, req, ASYNC_PROMON); 4339 } else { 4340 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4341 ibd_queue_work_slot(state, req, ASYNC_PROMOFF); 4342 } 4343 4344 return (0); 4345 } 4346 4347 /* 4348 * GLDv3 entry point for gathering statistics. 4349 */ 4350 static int 4351 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 4352 { 4353 ibd_state_t *state = (ibd_state_t *)arg; 4354 4355 switch (stat) { 4356 case MAC_STAT_IFSPEED: 4357 *val = state->id_link_speed; 4358 break; 4359 case MAC_STAT_MULTIRCV: 4360 *val = state->id_multi_rcv; 4361 break; 4362 case MAC_STAT_BRDCSTRCV: 4363 *val = state->id_brd_rcv; 4364 break; 4365 case MAC_STAT_MULTIXMT: 4366 *val = state->id_multi_xmt; 4367 break; 4368 case MAC_STAT_BRDCSTXMT: 4369 *val = state->id_brd_xmt; 4370 break; 4371 case MAC_STAT_RBYTES: 4372 *val = state->id_recv_bytes; 4373 break; 4374 case MAC_STAT_IPACKETS: 4375 *val = state->id_rcv_pkt; 4376 break; 4377 case MAC_STAT_OBYTES: 4378 *val = state->id_xmt_bytes; 4379 break; 4380 case MAC_STAT_OPACKETS: 4381 *val = state->id_xmt_pkt; 4382 break; 4383 case MAC_STAT_NORCVBUF: 4384 *val = state->id_rx_short; /* # times below water mark */ 4385 break; 4386 case MAC_STAT_OERRORS: 4387 *val = state->id_ah_error; /* failed AH translation */ 4388 break; 4389 case MAC_STAT_IERRORS: 4390 *val = 0; 4391 break; 4392 case MAC_STAT_NOXMTBUF: 4393 *val = state->id_tx_short; 4394 break; 4395 default: 4396 return (ENOTSUP); 4397 } 4398 4399 return (0); 4400 } 4401 4402 /* 4403 * Tx reschedule 4404 */ 4405 static void 4406 ibd_async_txsched(ibd_state_t *state) 4407 { 4408 ibd_req_t *req; 4409 4410 /* 4411 * For poll mode, if ibd is out of Tx wqe, 
reschedule to collect 4412 * the CQEs. Otherwise, just return for out of Tx wqe. 4413 */ 4414 4415 if (ibd_txcomp_poll == 1) { 4416 mutex_enter(&state->id_txcomp_lock); 4417 ibd_poll_compq(state, state->id_scq_hdl); 4418 mutex_exit(&state->id_txcomp_lock); 4419 if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { 4420 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4421 ibd_queue_work_slot(state, req, ASYNC_SCHED); 4422 return; 4423 } 4424 } else if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { 4425 return; 4426 } 4427 4428 if (state->id_sched_needed) { 4429 mac_tx_update(state->id_mh); 4430 state->id_sched_needed = B_FALSE; 4431 } 4432 } 4433 4434 /* 4435 * Release one or more chained send wqes back into free list. 4436 */ 4437 static void 4438 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *swqe) 4439 { 4440 /* 4441 * Add back on Tx list for reuse. 4442 */ 4443 swqe->swqe_next = NULL; 4444 mutex_enter(&state->id_tx_list.dl_mutex); 4445 if (state->id_tx_list.dl_pending_sends) { 4446 state->id_tx_list.dl_pending_sends = B_FALSE; 4447 } 4448 if (state->id_tx_list.dl_head == NULL) { 4449 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 4450 } else { 4451 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 4452 } 4453 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 4454 state->id_tx_list.dl_cnt++; 4455 mutex_exit(&state->id_tx_list.dl_mutex); 4456 } 4457 4458 /* 4459 * Acquire send wqe from free list. 4460 * Returns error number and send wqe pointer. 4461 */ 4462 static int 4463 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **swqe) 4464 { 4465 int rc = 0; 4466 ibd_swqe_t *wqe; 4467 4468 /* 4469 * Check and reclaim some of the completed Tx requests. 4470 * If someone else is already in this code and pulling Tx 4471 * completions, no need to poll, since the current lock holder 4472 * will do the work anyway. Normally, we poll for completions 4473 * every few Tx attempts, but if we are short on Tx descriptors, 4474 * we always try to poll. 
4475 */ 4476 if ((ibd_txcomp_poll == 1) && 4477 (state->id_tx_list.dl_cnt < IBD_TXPOLL_THRESHOLD) && 4478 (mutex_tryenter(&state->id_txcomp_lock) != 0)) { 4479 DPRINT(10, "ibd_send : polling"); 4480 ibd_poll_compq(state, state->id_scq_hdl); 4481 mutex_exit(&state->id_txcomp_lock); 4482 } 4483 4484 /* 4485 * Grab required transmit wqes. 4486 */ 4487 mutex_enter(&state->id_tx_list.dl_mutex); 4488 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 4489 if (wqe != NULL) { 4490 state->id_tx_list.dl_cnt -= 1; 4491 state->id_tx_list.dl_head = wqe->swqe_next; 4492 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 4493 state->id_tx_list.dl_tail = NULL; 4494 } else { 4495 /* 4496 * If we did not find the number we were looking for, flag 4497 * no resource. Adjust list appropriately in either case. 4498 */ 4499 rc = ENOENT; 4500 state->id_tx_list.dl_pending_sends = B_TRUE; 4501 DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); 4502 atomic_add_64(&state->id_tx_short, 1); 4503 } 4504 mutex_exit(&state->id_tx_list.dl_mutex); 4505 *swqe = wqe; 4506 4507 return (rc); 4508 } 4509 4510 /* 4511 * The passed in packet has this format: 4512 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 4513 */ 4514 static boolean_t 4515 ibd_send(ibd_state_t *state, mblk_t *mp) 4516 { 4517 ibt_status_t ibt_status; 4518 ibt_mr_attr_t mem_attr; 4519 ibd_ace_t *ace; 4520 ibd_swqe_t *node = NULL; 4521 ipoib_mac_t *dest; 4522 ibd_req_t *req; 4523 ib_header_info_t *ipibp; 4524 ip6_t *ip6h; 4525 mblk_t *nmp = mp; 4526 uint_t pktsize; 4527 size_t blksize; 4528 uchar_t *bufp; 4529 int i, ret, len, nmblks = 1; 4530 boolean_t dofree = B_TRUE; 4531 4532 if ((ret = ibd_acquire_swqes(state, &node)) != 0) { 4533 state->id_sched_needed = B_TRUE; 4534 if (ibd_txcomp_poll == 1) { 4535 goto ibd_send_fail; 4536 } 4537 return (B_FALSE); 4538 } 4539 4540 /* 4541 * Obtain an address handle for the destination. 
4542 */ 4543 ipibp = (ib_header_info_t *)mp->b_rptr; 4544 dest = (ipoib_mac_t *)&ipibp->ib_dst; 4545 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 4546 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 4547 4548 pktsize = msgsize(mp); 4549 atomic_add_64(&state->id_xmt_bytes, pktsize); 4550 atomic_inc_64(&state->id_xmt_pkt); 4551 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4552 atomic_inc_64(&state->id_brd_xmt); 4553 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 4554 atomic_inc_64(&state->id_multi_xmt); 4555 4556 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 4557 node->w_ahandle = ace; 4558 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4559 } else { 4560 DPRINT(5, 4561 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 4562 ((ret == EFAULT) ? "failed" : "queued"), 4563 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 4564 htonl(dest->ipoib_gidpref[1]), 4565 htonl(dest->ipoib_gidsuff[0]), 4566 htonl(dest->ipoib_gidsuff[1])); 4567 node->w_ahandle = NULL; 4568 /* 4569 * for the poll mode, it is probably some cqe pending in the 4570 * cq. So ibd has to poll cq here, otherwise acache probably 4571 * may not be recycled. 4572 */ 4573 if (ibd_txcomp_poll == 1) { 4574 mutex_enter(&state->id_txcomp_lock); 4575 ibd_poll_compq(state, state->id_scq_hdl); 4576 mutex_exit(&state->id_txcomp_lock); 4577 } 4578 /* 4579 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 4580 * can not find a path for the specific dest address. We 4581 * should get rid of this kind of packet. With the normal 4582 * case, ibd will return the packet to upper layer and wait 4583 * for AH creating. 4584 */ 4585 if (ret == EFAULT) 4586 ret = B_TRUE; 4587 else { 4588 ret = B_FALSE; 4589 dofree = B_FALSE; 4590 state->id_sched_needed = B_TRUE; 4591 } 4592 goto ibd_send_fail; 4593 } 4594 4595 /* 4596 * For ND6 packets, padding is at the front of the source lladdr. 4597 * Insert the padding at front. 
4598 */ 4599 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) { 4600 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 4601 if (!pullupmsg(mp, IPV6_HDR_LEN + 4602 sizeof (ib_header_info_t))) { 4603 DPRINT(10, "ibd_send: pullupmsg failure "); 4604 ret = B_TRUE; 4605 goto ibd_send_fail; 4606 } 4607 ipibp = (ib_header_info_t *)mp->b_rptr; 4608 } 4609 ip6h = (ip6_t *)((uchar_t *)ipibp + 4610 sizeof (ib_header_info_t)); 4611 len = ntohs(ip6h->ip6_plen); 4612 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 4613 mblk_t *pad; 4614 4615 pad = allocb(4, 0); 4616 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 4617 linkb(mp, pad); 4618 if (MBLKL(mp) < sizeof (ib_header_info_t) + 4619 IPV6_HDR_LEN + len + 4) { 4620 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 4621 IPV6_HDR_LEN + len + 4)) { 4622 DPRINT(10, "ibd_send: pullupmsg " 4623 "failure "); 4624 ret = B_TRUE; 4625 goto ibd_send_fail; 4626 } 4627 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 4628 sizeof (ib_header_info_t)); 4629 } 4630 4631 /* LINTED: E_CONSTANT_CONDITION */ 4632 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 4633 } 4634 } 4635 4636 mp->b_rptr += sizeof (ib_addrs_t); 4637 while (((nmp = nmp->b_cont) != NULL) && 4638 (++nmblks < (state->id_max_sqseg + 1))) 4639 ; 4640 4641 pktsize = msgsize(mp); 4642 /* 4643 * GLDv3 will check mtu. We do checksum related work here. 4644 */ 4645 IBD_CKSUM_SEND(mp); 4646 4647 /* 4648 * Copy the data to preregistered buffers, or register the buffer. 
4649 */ 4650 if ((nmblks <= state->id_max_sqseg) && 4651 (pktsize > IBD_TX_COPY_THRESHOLD)) { 4652 for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) { 4653 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr; 4654 mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr; 4655 mem_attr.mr_as = NULL; 4656 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4657 ibt_status = ibt_register_mr(state->id_hca_hdl, 4658 state->id_pd_hdl, &mem_attr, 4659 &node->w_smblkbuf[i].im_mr_hdl, 4660 &node->w_smblkbuf[i].im_mr_desc); 4661 if (ibt_status != IBT_SUCCESS) { 4662 /* 4663 * We do not expect any error other than 4664 * IBT_INSUFF_RESOURCE. 4665 */ 4666 if (ibt_status != IBT_INSUFF_RESOURCE) 4667 DPRINT(10, "ibd_send: %d\n", 4668 "failed in ibt_register_mem()", 4669 ibt_status); 4670 DPRINT(5, "ibd_send: registration failed"); 4671 node->w_swr.wr_nds = i; 4672 /* 4673 * Deregister already registered memory; 4674 * fallback to copying the mblk. 4675 */ 4676 ibd_deregister_mr(state, node); 4677 goto ibd_copy_path; 4678 } 4679 node->w_smblk_sgl[i].ds_va = 4680 (ib_vaddr_t)(uintptr_t)nmp->b_rptr; 4681 node->w_smblk_sgl[i].ds_key = 4682 node->w_smblkbuf[i].im_mr_desc.md_lkey; 4683 node->w_smblk_sgl[i].ds_len = 4684 nmp->b_wptr - nmp->b_rptr; 4685 } 4686 node->swqe_im_mblk = mp; 4687 node->w_swr.wr_sgl = node->w_smblk_sgl; 4688 node->w_swr.wr_nds = nmblks; 4689 dofree = B_FALSE; 4690 } else { 4691 ibd_copy_path: 4692 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 4693 node->w_swr.wr_nds = 1; 4694 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 4695 4696 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 4697 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 4698 blksize = MBLKL(nmp); 4699 bcopy(nmp->b_rptr, bufp, blksize); 4700 bufp += blksize; 4701 } 4702 } 4703 4704 /* 4705 * Queue the wqe to hardware. 
4706 */ 4707 ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL); 4708 if (ibt_status != IBT_SUCCESS) { 4709 /* 4710 * We should not fail here; but just in case we do, we 4711 * print out a warning to log. 4712 */ 4713 ibd_print_warn(state, "ibd_send: posting failed: %d", 4714 ibt_status); 4715 } 4716 4717 DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X", 4718 INCTXPACK, htonl(ace->ac_mac.ipoib_qpn), 4719 htonl(ace->ac_mac.ipoib_gidpref[0]), 4720 htonl(ace->ac_mac.ipoib_gidpref[1]), 4721 htonl(ace->ac_mac.ipoib_gidsuff[0]), 4722 htonl(ace->ac_mac.ipoib_gidsuff[1])); 4723 4724 if (dofree) 4725 freemsg(mp); 4726 4727 return (B_TRUE); 4728 4729 ibd_send_fail: 4730 if (state->id_sched_needed == B_TRUE) { 4731 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4732 if (req != NULL) 4733 ibd_queue_work_slot(state, req, ASYNC_SCHED); 4734 else { 4735 dofree = B_TRUE; 4736 ret = B_TRUE; 4737 } 4738 } 4739 4740 if (dofree) 4741 freemsg(mp); 4742 4743 if (node != NULL) 4744 ibd_tx_cleanup(state, node); 4745 4746 return (ret); 4747 } 4748 4749 /* 4750 * GLDv3 entry point for transmitting datagram. 4751 */ 4752 static mblk_t * 4753 ibd_m_tx(void *arg, mblk_t *mp) 4754 { 4755 ibd_state_t *state = (ibd_state_t *)arg; 4756 mblk_t *next; 4757 4758 while (mp != NULL) { 4759 next = mp->b_next; 4760 mp->b_next = NULL; 4761 if (!ibd_send(state, mp)) { 4762 /* Send fail */ 4763 mp->b_next = next; 4764 break; 4765 } 4766 mp = next; 4767 } 4768 4769 return (mp); 4770 } 4771 4772 /* 4773 * this handles Tx and Rx completions. With separate CQs, this handles 4774 * only Rx completions. 4775 */ 4776 static uint_t 4777 ibd_intr(char *arg) 4778 { 4779 ibd_state_t *state = (ibd_state_t *)arg; 4780 /* 4781 * Poll for completed entries; the CQ will not interrupt any 4782 * more for incoming (or transmitted) packets. 
4783 */ 4784 ibd_poll_compq(state, state->id_rcq_hdl); 4785 4786 /* 4787 * Now enable CQ notifications; all packets that arrive now 4788 * (or complete transmission) will cause new interrupts. 4789 */ 4790 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 4791 IBT_SUCCESS) { 4792 /* 4793 * We do not expect a failure here. 4794 */ 4795 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 4796 } 4797 4798 /* 4799 * Repoll to catch all packets that might have arrived after 4800 * we finished the first poll loop and before interrupts got 4801 * armed. 4802 */ 4803 ibd_poll_compq(state, state->id_rcq_hdl); 4804 4805 return (DDI_INTR_CLAIMED); 4806 } 4807 4808 /* 4809 * Common code for interrupt handling as well as for polling 4810 * for all completed wqe's while detaching. 4811 */ 4812 static void 4813 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 4814 { 4815 ibd_wqe_t *wqe; 4816 ibt_wc_t *wc, *wcs; 4817 uint_t numwcs, real_numwcs; 4818 int i; 4819 4820 /* 4821 * In some cases (eg detaching), this code can be invoked on 4822 * any cpu after disabling cq notification (thus no concurrency 4823 * exists). Apart from that, the following applies normally: 4824 * The receive completion handling is always on the Rx interrupt 4825 * cpu. Transmit completion handling could be from any cpu if 4826 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 4827 * is interrupt driven. Combined completion handling is always 4828 * on the interrupt cpu. Thus, lock accordingly and use the 4829 * proper completion array. 
4830 */ 4831 if (ibd_separate_cqs == 1) { 4832 if (cq_hdl == state->id_rcq_hdl) { 4833 wcs = state->id_rxwcs; 4834 numwcs = state->id_rxwcs_size; 4835 } else { 4836 wcs = state->id_txwcs; 4837 numwcs = state->id_txwcs_size; 4838 } 4839 } else { 4840 wcs = state->id_rxwcs; 4841 numwcs = state->id_rxwcs_size; 4842 } 4843 4844 if (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { 4845 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { 4846 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 4847 ASSERT((wqe->w_type == IBD_WQE_SEND) || 4848 (wqe->w_type == IBD_WQE_RECV)); 4849 if (wc->wc_status != IBT_WC_SUCCESS) { 4850 /* 4851 * Channel being torn down. 4852 */ 4853 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 4854 DPRINT(5, "ibd_intr: flush error"); 4855 /* 4856 * Only invoke the Tx handler to 4857 * release possibly held resources 4858 * like AH refcount etc. Can not 4859 * invoke Rx handler because it might 4860 * try adding buffers to the Rx pool 4861 * when we are trying to deinitialize. 4862 */ 4863 if (wqe->w_type == IBD_WQE_RECV) { 4864 continue; 4865 } else { 4866 DPRINT(10, "%s %d", 4867 "ibd_intr: Bad CQ status", 4868 wc->wc_status); 4869 } 4870 } 4871 } 4872 if (wqe->w_type == IBD_WQE_SEND) { 4873 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); 4874 } else { 4875 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 4876 } 4877 } 4878 } 4879 } 4880 4881 /* 4882 * Deregister the mr associated with a given mblk. 4883 */ 4884 static void 4885 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) 4886 { 4887 int i; 4888 4889 DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe, 4890 swqe->w_swr.wr_nds); 4891 4892 for (i = 0; i < swqe->w_swr.wr_nds; i++) { 4893 if (ibt_deregister_mr(state->id_hca_hdl, 4894 swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) { 4895 /* 4896 * We do not expect any errors here. 
4897 */ 4898 DPRINT(10, "failed in ibt_deregister_mem()\n"); 4899 } 4900 } 4901 } 4902 4903 /* 4904 * Common code that deals with clean ups after a successful or 4905 * erroneous transmission attempt. 4906 */ 4907 static void 4908 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 4909 { 4910 ibd_ace_t *ace = swqe->w_ahandle; 4911 4912 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 4913 4914 /* 4915 * If this was a dynamic registration in ibd_send(), 4916 * deregister now. 4917 */ 4918 if (swqe->swqe_im_mblk != NULL) { 4919 ibd_deregister_mr(state, swqe); 4920 freemsg(swqe->swqe_im_mblk); 4921 swqe->swqe_im_mblk = NULL; 4922 } 4923 4924 /* 4925 * Drop the reference count on the AH; it can be reused 4926 * now for a different destination if there are no more 4927 * posted sends that will use it. This can be eliminated 4928 * if we can always associate each Tx buffer with an AH. 4929 * The ace can be null if we are cleaning up from the 4930 * ibd_send() error path. 4931 */ 4932 if (ace != NULL) { 4933 /* 4934 * The recycling logic can be eliminated from here 4935 * and put into the async thread if we create another 4936 * list to hold ACE's for unjoined mcg's. 4937 */ 4938 if (DEC_REF_DO_CYCLE(ace)) { 4939 ibd_mce_t *mce; 4940 4941 /* 4942 * Check with the lock taken: we decremented 4943 * reference count without the lock, and some 4944 * transmitter might alreay have bumped the 4945 * reference count (possible in case of multicast 4946 * disable when we leave the AH on the active 4947 * list). If not still 0, get out, leaving the 4948 * recycle bit intact. 4949 * 4950 * Atomically transition the AH from active 4951 * to free list, and queue a work request to 4952 * leave the group and destroy the mce. No 4953 * transmitter can be looking at the AH or 4954 * the MCE in between, since we have the 4955 * ac_mutex lock. 
In the SendOnly reap case, 4956 * it is not neccesary to hold the ac_mutex 4957 * and recheck the ref count (since the AH was 4958 * taken off the active list), we just do it 4959 * to have uniform processing with the Full 4960 * reap case. 4961 */ 4962 mutex_enter(&state->id_ac_mutex); 4963 mce = ace->ac_mce; 4964 if (GET_REF_CYCLE(ace) == 0) { 4965 CLEAR_REFCYCLE(ace); 4966 /* 4967 * Identify the case of fullmember reap as 4968 * opposed to mcg trap reap. Also, port up 4969 * might set ac_mce to NULL to indicate Tx 4970 * cleanup should do no more than put the 4971 * AH in the free list (see ibd_async_link). 4972 */ 4973 if (mce != NULL) { 4974 ace->ac_mce = NULL; 4975 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 4976 /* 4977 * mc_req was initialized at mce 4978 * creation time. 4979 */ 4980 ibd_queue_work_slot(state, 4981 &mce->mc_req, ASYNC_REAP); 4982 } 4983 IBD_ACACHE_INSERT_FREE(state, ace); 4984 } 4985 mutex_exit(&state->id_ac_mutex); 4986 } 4987 } 4988 4989 /* 4990 * Release the send wqe for reuse. 4991 */ 4992 ibd_release_swqes(state, swqe); 4993 } 4994 4995 /* 4996 * Processing to be done after receipt of a packet; hand off to GLD 4997 * in the format expected by GLD. 4998 * The recvd packet has this format: 2b sap :: 00 :: data. 4999 */ 5000 static void 5001 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 5002 { 5003 ib_header_info_t *phdr; 5004 mblk_t *mp; 5005 ipoib_hdr_t *ipibp; 5006 ip6_t *ip6h; 5007 int rxcnt, len; 5008 5009 /* 5010 * Track number handed to upper layer, and number still 5011 * available to receive packets. 5012 */ 5013 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 5014 ASSERT(rxcnt >= 0); 5015 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); 5016 5017 /* 5018 * Adjust write pointer depending on how much data came in. 
5019 */ 5020 mp = rwqe->rwqe_im_mblk; 5021 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; 5022 5023 /* 5024 * the IB link will deliver one of the IB link layer 5025 * headers called, the Global Routing Header (GRH). 5026 * ibd driver uses the information in GRH to build the 5027 * Header_info structure and pass it with the datagram up 5028 * to GLDv3. 5029 * If the GRH is not valid, indicate to GLDv3 by setting 5030 * the VerTcFlow field to 0. 5031 */ 5032 phdr = (ib_header_info_t *)mp->b_rptr; 5033 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 5034 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 5035 5036 /* if it is loop back packet, just drop it. */ 5037 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 5038 IPOIB_ADDRL) == 0) { 5039 freemsg(mp); 5040 return; 5041 } 5042 5043 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 5044 sizeof (ipoib_mac_t)); 5045 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 5046 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 5047 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 5048 } else { 5049 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 5050 } 5051 } else { 5052 /* 5053 * It can not be a IBA multicast packet. Must have been 5054 * unicast for us. Just copy the interface address to dst. 5055 */ 5056 phdr->ib_grh.ipoib_vertcflow = 0; 5057 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 5058 sizeof (ipoib_mac_t)); 5059 } 5060 5061 DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK); 5062 5063 /* 5064 * For ND6 packets, padding is at the front of the source/target 5065 * lladdr. However the inet6 layer is not aware of it, hence remove 5066 * the padding from such packets. 
5067 */ 5068 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 5069 if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) { 5070 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { 5071 if (!pullupmsg(mp, IPV6_HDR_LEN + 5072 sizeof (ipoib_hdr_t))) { 5073 DPRINT(10, "ibd_process_rx: pullupmsg failed"); 5074 freemsg(mp); 5075 return; 5076 } 5077 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + 5078 sizeof (ipoib_pgrh_t)); 5079 } 5080 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 5081 len = ntohs(ip6h->ip6_plen); 5082 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5083 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + 5084 IPV6_HDR_LEN + len) { 5085 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + 5086 IPV6_HDR_LEN + len)) { 5087 DPRINT(10, "ibd_process_rx: pullupmsg" 5088 " failed"); 5089 freemsg(mp); 5090 return; 5091 } 5092 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 5093 sizeof (ipoib_pgrh_t) + 5094 sizeof (ipoib_hdr_t)); 5095 } 5096 /* LINTED: E_CONSTANT_CONDITION */ 5097 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 5098 } 5099 } 5100 5101 atomic_add_64(&state->id_recv_bytes, wc->wc_bytes_xfer); 5102 atomic_inc_64(&state->id_rcv_pkt); 5103 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5104 atomic_inc_64(&state->id_brd_rcv); 5105 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5106 atomic_inc_64(&state->id_multi_rcv); 5107 /* 5108 * Hand off to service thread/GLD. When we have hardware that 5109 * does hardware checksum, we will pull the checksum from the 5110 * work completion structure here. 5111 * on interrupt cpu. 5112 */ 5113 ibd_send_up(state, mp); 5114 5115 /* 5116 * Possibly replenish the Rx pool if needed. 
5117 */ 5118 if (rxcnt < IBD_RX_THRESHOLD) { 5119 state->id_rx_short++; 5120 if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) { 5121 if (ibd_post_rwqe(state, rwqe, B_FALSE) == 5122 DDI_FAILURE) { 5123 ibd_free_rwqe(state, rwqe); 5124 return; 5125 } 5126 } 5127 } 5128 } 5129 5130 /* 5131 * Callback code invoked from STREAMs when the recv data buffer is free 5132 * for recycling. 5133 */ 5134 static void 5135 ibd_freemsg_cb(char *arg) 5136 { 5137 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 5138 ibd_state_t *state = rwqe->w_state; 5139 5140 /* 5141 * If the wqe is being destructed, do not attempt recycling. 5142 */ 5143 if (rwqe->w_freeing_wqe == B_TRUE) { 5144 DPRINT(6, "ibd_freemsg: wqe being freed"); 5145 return; 5146 } 5147 5148 /* 5149 * Upper layer has released held mblk. 5150 */ 5151 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 5152 5153 if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) { 5154 /* 5155 * There are already enough buffers on the Rx ring. 5156 * Free this one up. 5157 */ 5158 rwqe->rwqe_im_mblk = NULL; 5159 ibd_delete_rwqe(state, rwqe); 5160 ibd_free_rwqe(state, rwqe); 5161 DPRINT(6, "ibd_freemsg: free up wqe"); 5162 } else { 5163 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 5164 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 5165 if (rwqe->rwqe_im_mblk == NULL) { 5166 ibd_delete_rwqe(state, rwqe); 5167 ibd_free_rwqe(state, rwqe); 5168 DPRINT(6, "ibd_freemsg: desballoc failed"); 5169 return; 5170 } 5171 5172 /* 5173 * Post back to h/w. We could actually have more than 5174 * id_num_rwqe WQEs on the list if there were multiple 5175 * ibd_freemsg_cb() calls outstanding (since the lock is 5176 * not held the entire time). This will start getting 5177 * corrected over subsequent ibd_freemsg_cb() calls. 
5178 */ 5179 if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) { 5180 ibd_delete_rwqe(state, rwqe); 5181 ibd_free_rwqe(state, rwqe); 5182 return; 5183 } 5184 } 5185 } 5186 5187 static uint_t 5188 ibd_tx_recycle(char *arg) 5189 { 5190 ibd_state_t *state = (ibd_state_t *)arg; 5191 5192 /* 5193 * Poll for completed entries; the CQ will not interrupt any 5194 * more for completed packets. 5195 */ 5196 ibd_poll_compq(state, state->id_scq_hdl); 5197 5198 /* 5199 * Now enable CQ notifications; all completions originating now 5200 * will cause new interrupts. 5201 */ 5202 if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != 5203 IBT_SUCCESS) { 5204 /* 5205 * We do not expect a failure here. 5206 */ 5207 DPRINT(10, "ibd_tx_recycle: ibt_enable_cq_notify() failed"); 5208 } 5209 5210 /* 5211 * Repoll to catch all packets that might have completed after 5212 * we finished the first poll loop and before interrupts got 5213 * armed. 5214 */ 5215 ibd_poll_compq(state, state->id_scq_hdl); 5216 5217 /* 5218 * Call txsched to notify GLDv3 if it required. 5219 */ 5220 ibd_async_txsched(state); 5221 5222 return (DDI_INTR_CLAIMED); 5223 } 5224 #ifdef RUN_PERFORMANCE 5225 5226 /* 5227 * To run the performance test, first do the "ifconfig ibdN plumb" on 5228 * the Rx and Tx side. Then use mdb -kw to tweak the following variables: 5229 * ibd_performance=1. 5230 * ibd_receiver=1 on Rx side. 5231 * ibd_sender=1 on Tx side. 5232 * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update 5233 * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx, this will 5234 * make it drop into a 1 minute loop waiting for packets. An 5235 * ifconfig/unplumb on the Tx will cause it to send packets to Rx. 
5236 */ 5237 5238 #define IBD_NUM_UNSIGNAL ibd_num_unsignal 5239 #define IBD_TX_PKTSIZE ibd_tx_pktsize 5240 #define IBD_TX_DATASIZE ibd_tx_datasize 5241 5242 static ibd_swqe_t **swqes; 5243 static ibt_wc_t *wcs; 5244 5245 /* 5246 * Set these on Rx and Tx side to do performance run. 5247 */ 5248 static int ibd_performance = 0; 5249 static int ibd_receiver = 0; 5250 static int ibd_sender = 0; 5251 static ipoib_mac_t ibd_dest; 5252 5253 /* 5254 * Interrupt coalescing is achieved by asking for a completion intr 5255 * only every ibd_num_unsignal'th packet. 5256 */ 5257 static int ibd_num_unsignal = 8; 5258 5259 /* 5260 * How big is each packet? 5261 */ 5262 static int ibd_tx_pktsize = 2048; 5263 5264 /* 5265 * Total data size to be transmitted. 5266 */ 5267 static int ibd_tx_datasize = 512*1024*1024; 5268 5269 static volatile boolean_t cq_handler_ran = B_FALSE; 5270 static volatile int num_completions; 5271 5272 /* ARGSUSED */ 5273 static void 5274 ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg) 5275 { 5276 ibd_state_t *state = (ibd_state_t *)arg; 5277 ibt_cq_hdl_t cqhdl; 5278 ibd_wqe_t *wqe; 5279 uint_t polled, i; 5280 boolean_t cq_enabled = B_FALSE; 5281 5282 if (ibd_receiver == 1) 5283 cqhdl = state->id_rcq_hdl; 5284 else 5285 cqhdl = state->id_scq_hdl; 5286 5287 /* 5288 * Mark the handler as having run and possibly freed up some 5289 * slots. Blocked sends can be retried. 5290 */ 5291 cq_handler_ran = B_TRUE; 5292 5293 repoll: 5294 while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) == 5295 IBT_SUCCESS) { 5296 num_completions += polled; 5297 if (ibd_receiver == 1) { 5298 /* 5299 * We can immediately recycle the buffer. No 5300 * need to pass up to any IP layer ... 5301 */ 5302 for (i = 0; i < polled; i++) { 5303 wqe = (ibd_wqe_t *)wcs[i].wc_id; 5304 (void) ibt_post_recv(state->id_chnl_hdl, 5305 &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL); 5306 } 5307 } 5308 } 5309 5310 /* 5311 * If we just repolled, we are done; exit. 
5312 */ 5313 if (cq_enabled) 5314 return; 5315 5316 /* 5317 * Enable CQ. 5318 */ 5319 if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 5320 /* 5321 * We do not expect a failure here. 5322 */ 5323 cmn_err(CE_CONT, "ibd_perf_handler: notify failed"); 5324 } 5325 cq_enabled = B_TRUE; 5326 5327 /* 5328 * Repoll for packets that came in after we finished previous 5329 * poll loop but before we turned on notifications. 5330 */ 5331 goto repoll; 5332 } 5333 5334 static void 5335 ibd_perf_tx(ibd_state_t *state) 5336 { 5337 ibt_mr_hdl_t mrhdl; 5338 ibt_mr_desc_t mrdesc; 5339 ibt_mr_attr_t mem_attr; 5340 ibt_status_t stat; 5341 ibd_ace_t *ace = NULL; 5342 ibd_swqe_t *node; 5343 uchar_t *sendbuf; 5344 longlong_t stime, etime; 5345 longlong_t sspin, espin, tspin = 0; 5346 int i, reps, packets; 5347 5348 cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X", 5349 htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]), 5350 htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]), 5351 htonl(ibd_dest.ipoib_gidsuff[1])); 5352 if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) || 5353 (ibd_dest.ipoib_gidpref[1] == 0)) { 5354 cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address"); 5355 return; 5356 } 5357 5358 packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE); 5359 reps = (packets / IBD_NUM_SWQE); 5360 5361 cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE); 5362 cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE); 5363 cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets); 5364 cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE); 5365 cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL); 5366 if ((packets % IBD_NUM_UNSIGNAL) != 0) { 5367 /* 5368 * This is required to ensure the last packet will trigger 5369 * a CQ handler callback, thus we can spin waiting fot all 5370 * packets to be received. 
5371 */ 5372 cmn_err(CE_CONT, 5373 "ibd_perf_tx: #Packets not multiple of Signal Grp size"); 5374 return; 5375 } 5376 num_completions = 0; 5377 5378 swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE, 5379 KM_NOSLEEP); 5380 if (swqes == NULL) { 5381 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5382 return; 5383 } 5384 5385 wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); 5386 if (wcs == NULL) { 5387 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5388 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5389 return; 5390 } 5391 5392 /* 5393 * Get the ud_dest for the destination. 5394 */ 5395 ibd_async_acache(state, &ibd_dest); 5396 mutex_enter(&state->id_ac_mutex); 5397 ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0); 5398 mutex_exit(&state->id_ac_mutex); 5399 if (ace == NULL) { 5400 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5401 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5402 cmn_err(CE_CONT, "ibd_perf_tx: no AH"); 5403 return; 5404 } 5405 5406 /* 5407 * Set up the send buffer. 5408 */ 5409 sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP); 5410 if (sendbuf == NULL) { 5411 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5412 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5413 cmn_err(CE_CONT, "ibd_perf_tx: no send buffer"); 5414 return; 5415 } 5416 5417 /* 5418 * This buffer can be used in the case when we want to 5419 * send data from the same memory area over and over; 5420 * it might help in reducing memory traffic. 
5421 */ 5422 mem_attr.mr_vaddr = (uint64_t)sendbuf; 5423 mem_attr.mr_len = IBD_TX_PKTSIZE; 5424 mem_attr.mr_as = NULL; 5425 mem_attr.mr_flags = IBT_MR_NOSLEEP; 5426 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 5427 &mrhdl, &mrdesc) != IBT_SUCCESS) { 5428 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5429 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5430 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5431 cmn_err(CE_CONT, "ibd_perf_tx: registration failed"); 5432 return; 5433 } 5434 5435 /* 5436 * Allocate private send wqe's. 5437 */ 5438 for (i = 0; i < IBD_NUM_SWQE; i++) { 5439 if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) { 5440 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5441 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5442 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5443 cmn_err(CE_CONT, "ibd_alloc_swqe failure"); 5444 return; 5445 } 5446 node->w_ahandle = ace; 5447 #if 0 5448 node->w_smblkbuf[0].im_mr_hdl = mrhdl; 5449 node->w_smblkbuf[0].im_mr_desc = mrdesc; 5450 node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf; 5451 node->w_smblk_sgl[0].ds_key = 5452 node->w_smblkbuf[0].im_mr_desc.md_lkey; 5453 node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE; 5454 node->w_swr.wr_sgl = node->w_smblk_sgl; 5455 #else 5456 node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE; 5457 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5458 #endif 5459 5460 /* 5461 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs 5462 * is marked to invoke the CQ handler. That is the only 5463 * way we come to know when the send queue can accept more 5464 * WRs. 5465 */ 5466 if (((i + 1) % IBD_NUM_UNSIGNAL) != 0) 5467 node->w_swr.wr_flags = IBT_WR_NO_FLAGS; 5468 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5469 node->w_swr.wr_nds = 1; 5470 5471 swqes[i] = node; 5472 } 5473 5474 ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state); 5475 5476 /* 5477 * Post all the requests. 
We expect this stream of post's will 5478 * not overwhelm the hardware due to periodic completions and 5479 * pollings that happen out of ibd_perf_handler. 5480 * Post a set of requests, till the channel can accept; after 5481 * that, wait for the CQ handler to notify us that there is more 5482 * space. 5483 */ 5484 stime = gethrtime(); 5485 for (; reps > 0; reps--) 5486 for (i = 0; i < IBD_NUM_SWQE; i++) { 5487 node = swqes[i]; 5488 retry: 5489 if ((stat = ibt_post_send(state->id_chnl_hdl, 5490 &node->w_swr, 1, NULL)) != IBT_SUCCESS) { 5491 if (stat == IBT_CHAN_FULL) { 5492 /* 5493 * Spin till the CQ handler runs 5494 * and then try again. 5495 */ 5496 sspin = gethrtime(); 5497 while (!cq_handler_ran) 5498 ; 5499 espin = gethrtime(); 5500 tspin += (espin - sspin); 5501 cq_handler_ran = B_FALSE; 5502 goto retry; 5503 } 5504 cmn_err(CE_CONT, "post failure %d/%d", stat, i); 5505 goto done; 5506 } 5507 } 5508 5509 done: 5510 /* 5511 * We should really be snapshotting when we get the last 5512 * completion. 5513 */ 5514 while (num_completions != (packets / IBD_NUM_UNSIGNAL)) 5515 ; 5516 etime = gethrtime(); 5517 5518 cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d", 5519 num_completions); 5520 cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime)); 5521 cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin); 5522 5523 /* 5524 * Wait a sec for everything to get over. 5525 */ 5526 delay(drv_usectohz(2000000)); 5527 5528 /* 5529 * Reset CQ handler to real one; free resources. 
5530 */ 5531 if (ibd_separate_cqs == 0) { 5532 ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state); 5533 } else { 5534 if (ibd_txcomp_poll == 0) 5535 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, 5536 state); 5537 else 5538 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5539 } 5540 5541 for (i = 0; i < IBD_NUM_SWQE; i++) 5542 ibd_free_swqe(state, swqes[i]); 5543 (void) ibt_deregister_mr(state->id_hca_hdl, mrhdl); 5544 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5545 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5546 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5547 } 5548 5549 static void 5550 ibd_perf_rx(ibd_state_t *state) 5551 { 5552 wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); 5553 if (wcs == NULL) { 5554 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5555 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5556 return; 5557 } 5558 5559 /* 5560 * We do not need to allocate private recv wqe's. We will 5561 * just use the regular ones. 5562 */ 5563 5564 num_completions = 0; 5565 ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state); 5566 5567 /* 5568 * Delay for a minute for all the packets to come in from 5569 * transmitter. 5570 */ 5571 cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_SWQE); 5572 delay(drv_usectohz(60000000)); 5573 cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions); 5574 5575 /* 5576 * Reset CQ handler to real one; free resources. 5577 */ 5578 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 5579 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5580 } 5581 5582 static void 5583 ibd_perf(ibd_state_t *state) 5584 { 5585 if (ibd_performance == 0) 5586 return; 5587 5588 if (ibd_receiver == 1) { 5589 ibd_perf_rx(state); 5590 return; 5591 } 5592 5593 if (ibd_sender == 1) { 5594 ibd_perf_tx(state); 5595 return; 5596 } 5597 } 5598 5599 #endif /* RUN_PERFORMANCE */ 5600