/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>

#include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip_if.h>		/* for IP6_DL_SAP */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/pattr.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

/*
 * Modes of hardware/driver/software checksum, useful for debugging
 * and performance studies.
 *
 * none: neither h/w (Tavor) nor the driver does the checksum; IP
 * software must.
 * partial: the driver does the data checksum, IP must provide the
 * pseudo header.
 * perf_partial: the driver uses the IP provided pseudo cksum as the
 * data checksum (thus, real checksumming is not done).
 */
typedef enum {
	IBD_CSUM_NONE,
	IBD_CSUM_PARTIAL,
	IBD_CSUM_PERF_PARTIAL
} ibd_csum_type_t;

typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;

/*
 * Per interface tunable parameters.
 */
static uint_t ibd_rx_threshold = 16;
static uint_t ibd_tx_current_copy_threshold = 0x10000000;
/* should be less than the max Tavor CQ size and be 2^n - 1 */
static uint_t ibd_num_rwqe = 511;
static uint_t ibd_num_swqe = 511;
static uint_t ibd_num_ah = 16;
static uint_t ibd_hash_size = 16;
static uint_t ibd_srv_fifos = 0x0;
static uint_t ibd_fifo_depth = 0;
static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;

/*
 * The driver can use separate CQs for send and receive queues.
101 * While using separate CQs, it is possible to put the send CQ 102 * in polling mode, ie not to enable notifications on that CQ. 103 * If both CQs are interrupt driven, currently it is not possible 104 * for their handlers to be invoked concurrently (since Tavor ties 105 * both interrupts to the same PCI intr line); but the handlers 106 * are not coded with a single interrupt cpu assumption (eg 107 * id_num_intrs is incremented atomically). 108 * 109 * The driver private struct uses id_scq_hdl to track the separate 110 * CQ being used for send; the id_rcq_hdl tracks the receive CQ 111 * if using separate CQs, or it tracks the single CQ when using 112 * combined CQ. The id_wcs completion array is used in the combined 113 * CQ case, and for fetching Rx completions in the separate CQs case; 114 * the id_txwcs is used to fetch Tx completions in the separate CQs 115 * case. 116 */ 117 static uint_t ibd_separate_cqs = 1; 118 static uint_t ibd_txcomp_poll = 0; 119 120 /* 121 * the softintr is introduced to avoid Event Queue overflow. It 122 * should not have heavy load in CQ event handle function. 123 * If service fifos is enabled, this is not required, because 124 * mac_rx() will be called by service threads. 125 */ 126 static uint_t ibd_rx_softintr = 1; 127 static uint_t ibd_tx_softintr = 1; 128 129 /* 130 * Initial number of IBA resources allocated. 131 */ 132 #define IBD_NUM_RWQE ibd_num_rwqe 133 #define IBD_NUM_SWQE ibd_num_swqe 134 #define IBD_NUM_AH ibd_num_ah 135 136 /* when <= threshold, it's faster to copy to a premapped buffer */ 137 #define IBD_TX_COPY_THRESHOLD ibd_tx_current_copy_threshold 138 139 /* 140 * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will 141 * allocate a new WQE to put on the the rxlist. This value must be <= 142 * IBD_NUM_RWQE/id_num_rwqe. 143 */ 144 #define IBD_RX_THRESHOLD ibd_rx_threshold 145 146 /* 147 * Hash table size for the active AH list. 148 */ 149 #define IBD_HASH_SIZE ibd_hash_size 150 151 #define IBD_TXPOLL_THRESHOLD 64 152 /* 153 * PAD routine called during send/recv context 154 */ 155 #define IBD_SEND 0 156 #define IBD_RECV 1 157 158 /* 159 * fill / clear in <scope> and <p_key> in multicast/broadcast address. 160 */ 161 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 162 { \ 163 *(uint32_t *)((char *)(maddr) + 4) |= \ 164 htonl((uint32_t)(scope) << 16); \ 165 *(uint32_t *)((char *)(maddr) + 8) |= \ 166 htonl((uint32_t)(pkey) << 16); \ 167 } 168 169 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 170 { \ 171 *(uint32_t *)((char *)(maddr) + 4) &= \ 172 htonl(~((uint32_t)0xF << 16)); \ 173 *(uint32_t *)((char *)(maddr) + 8) &= \ 174 htonl(~((uint32_t)0xFFFF << 16)); \ 175 } 176 177 /* 178 * when free tx wqes >= threshold and reschedule flag is set, 179 * ibd will call mac_tx_update to re-enable Tx. 
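 * IBD_TX_UPDATE_THRESHOLD below is that threshold; at its value of 1,
 * Tx is re-enabled as soon as at least one send WQE has been reclaimed.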
180 */ 181 #define IBD_TX_UPDATE_THRESHOLD 1 182 183 /* Driver State Pointer */ 184 void *ibd_list; 185 186 /* Required system entry points */ 187 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 188 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 189 190 /* Required driver entry points for GLDv3 */ 191 static int ibd_m_start(void *); 192 static void ibd_m_stop(void *); 193 static int ibd_m_unicst(void *, const uint8_t *); 194 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 195 static int ibd_m_promisc(void *, boolean_t); 196 static int ibd_m_stat(void *, uint_t, uint64_t *); 197 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 198 static mblk_t *ibd_m_tx(void *, mblk_t *); 199 200 /* Private driver entry points for GLDv3 */ 201 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 202 static uint_t ibd_intr(char *); 203 static uint_t ibd_tx_recycle(char *); 204 static int ibd_state_init(ibd_state_t *, dev_info_t *); 205 static void ibd_state_fini(ibd_state_t *); 206 static int ibd_drv_init(ibd_state_t *); 207 static void ibd_drv_fini(ibd_state_t *); 208 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 209 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 210 static void ibd_snet_notices_handler(void *, ib_gid_t, 211 ibt_subnet_event_code_t, ibt_subnet_event_t *); 212 static int ibd_init_txlist(ibd_state_t *); 213 static void ibd_fini_txlist(ibd_state_t *); 214 static int ibd_init_rxlist(ibd_state_t *); 215 static void ibd_fini_rxlist(ibd_state_t *); 216 static void ibd_freemsg_cb(char *); 217 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 218 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 219 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **); 220 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 221 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 222 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 223 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 224 ibt_async_event_t *); 225 static int ibd_acache_init(ibd_state_t *); 226 static void ibd_acache_fini(ibd_state_t *); 227 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 228 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 229 static void ibd_async_unsetprom(ibd_state_t *); 230 static void ibd_async_setprom(ibd_state_t *); 231 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 232 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 233 static void ibd_async_txsched(ibd_state_t *); 234 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 235 static void ibd_async_work(ibd_state_t *); 236 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 237 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 238 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t); 239 static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *, 240 ipoib_mac_t *); 241 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 242 static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *); 243 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 244 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 245 static uint64_t ibd_get_portspeed(ibd_state_t *); 246 247 #ifdef RUN_PERFORMANCE 248 static void ibd_perf(ibd_state_t *); 249 #endif 250 251 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 252 nodev, NULL, D_MP, NULL); 253 254 /* Module Driver Info */ 255 static struct modldrv 
ibd_modldrv = { 256 &mod_driverops, /* This one is a driver */ 257 "InfiniBand GLDv3 Driver 1.3", /* short description */ 258 &ibd_dev_ops /* driver specific ops */ 259 }; 260 261 /* Module Linkage */ 262 static struct modlinkage ibd_modlinkage = { 263 MODREV_1, (void *)&ibd_modldrv, NULL 264 }; 265 266 /* 267 * Module Info passed to IBTL during IBT_ATTACH. 268 * NOTE: This data must be static (i.e. IBTL just keeps a pointer to this 269 * data). 270 */ 271 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 272 IBTI_V2, 273 IBT_NETWORK, 274 ibd_async_handler, 275 NULL, 276 "IPIB" 277 }; 278 279 /* 280 * Async operation types. 281 */ 282 #define ASYNC_GETAH 1 283 #define ASYNC_JOIN 2 284 #define ASYNC_LEAVE 3 285 #define ASYNC_PROMON 4 286 #define ASYNC_PROMOFF 5 287 #define ASYNC_REAP 6 288 #define ASYNC_TRAP 8 289 #define ASYNC_SCHED 9 290 #define ASYNC_LINK 10 291 #define ASYNC_EXIT 11 292 293 /* 294 * Async operation states 295 */ 296 #define NOTSTARTED 0 297 #define ONGOING 1 298 #define COMPLETED 2 299 #define ERRORED 3 300 #define ROUTERED 4 301 302 #define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF 303 304 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 305 static mac_callbacks_t ib_m_callbacks = { 306 IBD_M_CALLBACK_FLAGS, 307 ibd_m_stat, 308 ibd_m_start, 309 ibd_m_stop, 310 ibd_m_promisc, 311 ibd_m_multicst, 312 ibd_m_unicst, 313 ibd_m_tx, 314 NULL, 315 NULL, 316 ibd_m_getcapab 317 }; 318 319 #ifdef DEBUG 320 321 static int rxpack = 1, txpack = 1; 322 int ibd_debuglevel = 100; 323 static void 324 debug_print(int l, char *fmt, ...) 325 { 326 va_list ap; 327 328 if (l < ibd_debuglevel) 329 return; 330 va_start(ap, fmt); 331 vcmn_err(CE_CONT, fmt, ap); 332 va_end(ap); 333 } 334 #define INCRXPACK (rxpack++) 335 #define INCTXPACK (txpack++) 336 #define DPRINT debug_print 337 338 #else /* DEBUG */ 339 340 #define INCRXPACK 0 341 #define INCTXPACK 0 342 #define DPRINT 343 344 #endif /* DEBUG */ 345 346 /* 347 * Common routine to print warning messages; adds in hca guid, port number 348 * and pkey to be able to identify the IBA interface. 349 */ 350 static void 351 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 
352 { 353 ib_guid_t hca_guid; 354 char ibd_print_buf[256]; 355 int len; 356 va_list ap; 357 358 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 359 0, "hca-guid", 0); 360 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 361 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 362 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 363 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 364 va_start(ap, fmt); 365 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 366 fmt, ap); 367 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 368 va_end(ap); 369 } 370 371 /* warlock directives */ 372 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 373 ibd_state_t::id_ah_active)) 374 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free)) 375 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 376 ibd_state_t::id_req_list)) 377 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 378 ibd_state_t::id_acache_req_cv)) 379 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 380 ibd_state_t::id_mc_full)) 381 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 382 ibd_state_t::id_mc_non)) 383 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 384 ibd_state_t::id_link_state)) 385 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 386 ibd_state_s::id_tx_list)) 387 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 388 ibd_state_s::id_rx_list)) 389 390 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error)) 391 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op)) 392 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs)) 393 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op)) 394 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short)) 395 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list)) 396 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list)) 397 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op)) 398 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid)) 399 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr)) 400 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce)) 401 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref)) 402 403 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s)) 404 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s)) 405 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s)) 406 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac)) 407 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh)) 408 409 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s)) 410 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req)) 411 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap)) 412 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate)) 413 414 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr)) 415 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr)) 416 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id)) 417 418 #ifdef DEBUG 419 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack)) 420 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack)) 421 #endif 422 423 int 424 _init() 425 { 426 int status; 427 428 /* 429 * Sanity check some parameter settings. 
Tx completion polling 430 * only makes sense with separate CQs for Tx and Rx. 431 */ 432 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 433 cmn_err(CE_NOTE, "!ibd: %s", 434 "Setting ibd_txcomp_poll = 0 for combined CQ"); 435 ibd_txcomp_poll = 0; 436 } 437 438 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 439 if (status != 0) { 440 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 441 return (status); 442 } 443 444 mac_init_ops(&ibd_dev_ops, "ibd"); 445 status = mod_install(&ibd_modlinkage); 446 if (status != 0) { 447 DPRINT(10, "_init:failed in mod_install()"); 448 ddi_soft_state_fini(&ibd_list); 449 mac_fini_ops(&ibd_dev_ops); 450 return (status); 451 } 452 453 return (0); 454 } 455 456 int 457 _info(struct modinfo *modinfop) 458 { 459 return (mod_info(&ibd_modlinkage, modinfop)); 460 } 461 462 int 463 _fini() 464 { 465 int status; 466 467 status = mod_remove(&ibd_modlinkage); 468 if (status != 0) 469 return (status); 470 471 mac_fini_ops(&ibd_dev_ops); 472 ddi_soft_state_fini(&ibd_list); 473 return (0); 474 } 475 476 /* 477 * Convert the GID part of the mac address from network byte order 478 * to host order. 479 */ 480 static void 481 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 482 { 483 ib_sn_prefix_t nbopref; 484 ib_guid_t nboguid; 485 486 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 487 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 488 dgid->gid_prefix = b2h64(nbopref); 489 dgid->gid_guid = b2h64(nboguid); 490 } 491 492 /* 493 * Create the IPoIB address in network byte order from host order inputs. 494 */ 495 static void 496 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 497 ib_guid_t guid) 498 { 499 ib_sn_prefix_t nbopref; 500 ib_guid_t nboguid; 501 502 mac->ipoib_qpn = htonl(qpn); 503 nbopref = h2b64(prefix); 504 nboguid = h2b64(guid); 505 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 506 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 507 } 508 509 /* 510 * Send to the appropriate all-routers group when the IBA multicast group 511 * does not exist, based on whether the target group is v4 or v6. 512 */ 513 static boolean_t 514 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 515 ipoib_mac_t *rmac) 516 { 517 boolean_t retval = B_TRUE; 518 uint32_t adjscope = state->id_scope << 16; 519 uint32_t topword; 520 521 /* 522 * Copy the first 4 bytes in without assuming any alignment of 523 * input mac address; this will have IPoIB signature, flags and 524 * scope bits. 525 */ 526 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 527 topword = ntohl(topword); 528 529 /* 530 * Generate proper address for IPv4/v6, adding in the Pkey properly. 531 */ 532 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 533 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 534 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 535 ((uint32_t)(state->id_pkey << 16))), 536 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 537 else 538 /* 539 * Does not have proper bits in the mgid address. 540 */ 541 retval = B_FALSE; 542 543 return (retval); 544 } 545 546 /* 547 * Implementation of various (software) flavors of send and receive side 548 * checksumming. 549 */ 550 #define IBD_CKSUM_SEND(mp) { \ 551 uint32_t start, stuff, end, value, flags; \ 552 uint32_t cksum, sum; \ 553 uchar_t *dp, *buf; \ 554 uint16_t *up; \ 555 \ 556 if (ibd_csum_send == IBD_CSUM_NONE) \ 557 goto punt_send; \ 558 \ 559 /* \ 560 * Query IP whether Tx cksum needs to be done. 
\ 561 */ \ 562 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, \ 563 &value, &flags); \ 564 \ 565 if (flags == HCK_PARTIALCKSUM) { \ 566 dp = ((uchar_t *)mp->b_rptr + IPOIB_HDRSIZE); \ 567 up = (uint16_t *)(dp + stuff); \ 568 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 569 end = ((uchar_t *)mp->b_wptr - dp - start); \ 570 cksum = *up; \ 571 *up = 0; \ 572 /* \ 573 * Does NOT handle chained mblks/more than one \ 574 * SGL. Applicable only for a single SGL \ 575 * entry/mblk, where the stuff offset is \ 576 * within the range of buf. \ 577 */ \ 578 buf = (dp + start); \ 579 sum = IP_BCSUM_PARTIAL(buf, end, cksum); \ 580 } else { \ 581 sum = *up; \ 582 } \ 583 DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n", \ 584 start, stuff, end, sum, cksum); \ 585 sum = ~(sum); \ 586 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 587 } \ 588 punt_send: \ 589 ; \ 590 } 591 592 #define IBD_CKSUM_RECV(mp) { \ 593 uchar_t *dp, *buf; \ 594 uint32_t start, end, value, stuff, flags; \ 595 uint16_t *up, frag; \ 596 ipha_t *iphp; \ 597 ipoib_hdr_t *ipibh; \ 598 \ 599 if (ibd_csum_recv == IBD_CSUM_NONE) \ 600 goto punt_recv; \ 601 \ 602 ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\ 603 if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP) \ 604 goto punt_recv; \ 605 \ 606 dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE); \ 607 iphp = (ipha_t *)dp; \ 608 frag = ntohs(iphp->ipha_fragment_offset_and_flags); \ 609 if ((frag) & (~IPH_DF)) \ 610 goto punt_recv; \ 611 start = IPH_HDR_LENGTH(iphp); \ 612 if (iphp->ipha_protocol == IPPROTO_TCP) \ 613 stuff = start + 16; \ 614 else if (iphp->ipha_protocol == IPPROTO_UDP) \ 615 stuff = start + 6; \ 616 else \ 617 goto punt_recv; \ 618 \ 619 flags = HCK_PARTIALCKSUM; \ 620 end = ntohs(iphp->ipha_length); \ 621 up = (uint16_t *)(dp + stuff); \ 622 \ 623 if (ibd_csum_recv == IBD_CSUM_PARTIAL) { \ 624 buf = (dp + start); \ 625 value = IP_BCSUM_PARTIAL(buf, end - start, 0); \ 626 } else { \ 627 value = (*up); \ 628 } \ 629 if (hcksum_assoc(mp, NULL, NULL, start, stuff, end, \ 630 value, flags, 0) != 0) \ 631 DPRINT(10, "cksum_recv: value: %x\n", value); \ 632 punt_recv: \ 633 ; \ 634 } 635 636 /* 637 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 638 * front of optional src/tgt link layer address. Right now Solaris inserts 639 * padding by default at the end. The routine which is doing is nce_xmit() 640 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when 641 * the packet comes down from IP layer to the IBD driver, it is in the 642 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 643 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 644 * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 645 * 646 * The send routine at IBD driver changes this packet as follows: 647 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 648 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 649 * aligned. 650 * 651 * At the receiving side again ibd_process_rx takes the above packet and 652 * removes the two bytes of front padding and inserts it at the end. This 653 * is since the IP layer does not understand padding at the front. 
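 *
 * The IBD_PAD_NSNA() macro below handles both directions: with type ==
 * IBD_SEND (0) it shifts the link layer address two bytes toward the
 * end of the option and zero-fills the two front (padding) bytes, while
 * with type == IBD_RECV it shifts the address back toward the front and
 * zero-fills the two trailing bytes.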
654 */ 655 #define IBD_PAD_NSNA(ip6h, len, type) { \ 656 uchar_t *nd_lla_ptr; \ 657 icmp6_t *icmp6; \ 658 nd_opt_hdr_t *opt; \ 659 int i; \ 660 \ 661 icmp6 = (icmp6_t *)&ip6h[1]; \ 662 len -= sizeof (nd_neighbor_advert_t); \ 663 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 664 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 665 (len != 0)) { \ 666 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 667 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 668 ASSERT(opt != NULL); \ 669 nd_lla_ptr = (uchar_t *)&opt[1]; \ 670 if (type == 0) { \ 671 for (i = IPOIB_ADDRL; i > 0; i--) \ 672 *(nd_lla_ptr + i + 1) = \ 673 *(nd_lla_ptr + i - 1); \ 674 } else { \ 675 for (i = 0; i < IPOIB_ADDRL; i++) \ 676 *(nd_lla_ptr + i) = \ 677 *(nd_lla_ptr + i + 2); \ 678 } \ 679 *(nd_lla_ptr + i) = 0; \ 680 *(nd_lla_ptr + i + 1) = 0; \ 681 } \ 682 } 683 684 /* 685 * The service fifo code is copied verbatim from Cassini. This can be 686 * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu. 687 */ 688 689 typedef caddr_t fifo_obj_t, *p_fifo_obj_t; 690 691 typedef struct _srv_fifo_t { 692 kmutex_t fifo_lock; 693 kcondvar_t fifo_cv; 694 size_t size; 695 uint_t max_index; 696 uint_t rd_index; 697 uint_t wr_index; 698 uint_t objs_pending; 699 p_fifo_obj_t fifo_objs; 700 kthread_t *fifo_thread; 701 void (*drain_func)(caddr_t drain_func_arg); 702 caddr_t drain_func_arg; 703 boolean_t running; 704 callb_cpr_t cprinfo; 705 } srv_fifo_t, *p_srv_fifo_t; 706 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv)) 707 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo)) 708 709 static int 710 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size, 711 void (*drain_func)(), caddr_t drain_func_arg) 712 { 713 int status; 714 p_srv_fifo_t srv_fifo; 715 716 status = DDI_SUCCESS; 717 srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP); 718 srv_fifo->size = size; 719 srv_fifo->max_index = size - 1; 720 srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc( 721 size * sizeof (fifo_obj_t), KM_SLEEP); 722 mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL); 723 cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL); 724 srv_fifo->drain_func = drain_func; 725 srv_fifo->drain_func_arg = drain_func_arg; 726 srv_fifo->running = DDI_SUCCESS; 727 srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func, 728 (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60); 729 if (srv_fifo->fifo_thread == NULL) { 730 cv_destroy(&srv_fifo->fifo_cv); 731 mutex_destroy(&srv_fifo->fifo_lock); 732 kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t)); 733 kmem_free(srv_fifo, sizeof (srv_fifo_t)); 734 srv_fifo = NULL; 735 status = DDI_FAILURE; 736 } else 737 *handle = srv_fifo; 738 return (status); 739 } 740 741 static void 742 _ddi_srv_fifo_destroy(p_srv_fifo_t handle) 743 { 744 kt_did_t tid = handle->fifo_thread->t_did; 745 746 mutex_enter(&handle->fifo_lock); 747 handle->running = DDI_FAILURE; 748 cv_signal(&handle->fifo_cv); 749 while (handle->running == DDI_FAILURE) 750 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 751 mutex_exit(&handle->fifo_lock); 752 if (handle->objs_pending != 0) 753 cmn_err(CE_NOTE, "!Thread Exit with work undone."); 754 cv_destroy(&handle->fifo_cv); 755 mutex_destroy(&handle->fifo_lock); 756 kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t)); 757 kmem_free(handle, sizeof (srv_fifo_t)); 758 thread_join(tid); 759 } 760 761 static caddr_t 762 _ddi_srv_fifo_begin(p_srv_fifo_t handle) 763 { 764 #ifndef __lock_lint 765 CALLB_CPR_INIT(&handle->cprinfo, 
&handle->fifo_lock, 766 callb_generic_cpr, "srv_fifo"); 767 #endif /* ! _lock_lint */ 768 return (handle->drain_func_arg); 769 } 770 771 static void 772 _ddi_srv_fifo_end(p_srv_fifo_t handle) 773 { 774 callb_cpr_t cprinfo; 775 776 mutex_enter(&handle->fifo_lock); 777 cprinfo = handle->cprinfo; 778 handle->running = DDI_SUCCESS; 779 cv_signal(&handle->fifo_cv); 780 #ifndef __lock_lint 781 CALLB_CPR_EXIT(&cprinfo); 782 #endif /* ! _lock_lint */ 783 thread_exit(); 784 _NOTE(NOT_REACHED) 785 } 786 787 static int 788 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal) 789 { 790 int status; 791 792 mutex_enter(&handle->fifo_lock); 793 status = handle->running; 794 if (status == DDI_SUCCESS) { 795 if (ptr) { 796 if (handle->objs_pending < handle->size) { 797 if (handle->wr_index == handle->max_index) 798 handle->wr_index = 0; 799 else 800 handle->wr_index++; 801 handle->fifo_objs[handle->wr_index] = ptr; 802 handle->objs_pending++; 803 } else 804 status = DDI_FAILURE; 805 if (signal) 806 cv_signal(&handle->fifo_cv); 807 } else { 808 if (signal && (handle->objs_pending > 0)) 809 cv_signal(&handle->fifo_cv); 810 } 811 } 812 mutex_exit(&handle->fifo_lock); 813 return (status); 814 } 815 816 static int 817 _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr) 818 { 819 int status; 820 821 mutex_enter(&handle->fifo_lock); 822 status = handle->running; 823 if (status == DDI_SUCCESS) { 824 if (handle->objs_pending == 0) { 825 #ifndef __lock_lint 826 CALLB_CPR_SAFE_BEGIN(&handle->cprinfo); 827 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 828 CALLB_CPR_SAFE_END(&handle->cprinfo, 829 &handle->fifo_lock); 830 #endif /* !_lock_lint */ 831 *ptr = NULL; 832 } 833 if (handle->objs_pending > 0) { 834 if (handle->rd_index == handle->max_index) 835 handle->rd_index = 0; 836 else 837 handle->rd_index++; 838 *ptr = handle->fifo_objs[handle->rd_index]; 839 handle->objs_pending--; 840 } 841 status = handle->running; 842 } else { 843 if (handle->objs_pending) { 844 if (handle->rd_index == handle->max_index) 845 handle->rd_index = 0; 846 else 847 handle->rd_index++; 848 *ptr = handle->fifo_objs[handle->rd_index]; 849 handle->objs_pending--; 850 status = DDI_SUCCESS; 851 } else 852 status = DDI_FAILURE; 853 } 854 mutex_exit(&handle->fifo_lock); 855 return (status); 856 } 857 858 /* 859 * [un]map_rx_srv_fifos has been modified from its CE version. 860 */ 861 static void 862 drain_fifo(p_srv_fifo_t handle) 863 { 864 ibd_state_t *state; 865 mblk_t *mp; 866 867 state = (ibd_state_t *)_ddi_srv_fifo_begin(handle); 868 while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) { 869 /* 870 * Hand off to GLDv3. 871 */ 872 IBD_CKSUM_RECV(mp); 873 mac_rx(state->id_mh, NULL, mp); 874 } 875 _ddi_srv_fifo_end(handle); 876 } 877 878 static p_srv_fifo_t * 879 map_rx_srv_fifos(int *nfifos, void *private) 880 { 881 p_srv_fifo_t *srv_fifos; 882 int i, inst_taskqs, depth; 883 884 /* 885 * Default behavior on both sparc and amd cpus in terms of 886 * of worker thread is as follows: (N) indicates worker thread 887 * not enabled , (Y) indicates worker thread enabled. Default of 888 * ibd_srv_fifo is set to 0xffff. The default behavior can be 889 * overridden by setting ibd_srv_fifos to 0 or 1 as shown below. 890 * Worker thread model assigns lower priority to network 891 * processing making system more usable at higher network 892 * loads. 
893 * ________________________________________________________ 894 * |Value of ibd_srv_fifo | 0 | 1 | 0xffff| 0 | 1 | 0xfffff | 895 * |----------------------|---|---|-------|---|---|---------| 896 * | | Sparc | x86 | 897 * |----------------------|---|---|-------|---|---|---------| 898 * | Single CPU |N | Y | N | N | Y | N | 899 * |----------------------|---|---|-------|---|---|---------| 900 * | Multi CPU |N | Y | Y | N | Y | Y | 901 * |______________________|___|___|_______|___|___|_________| 902 */ 903 if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) || 904 (ibd_srv_fifos == 0)) { 905 *nfifos = 0; 906 return ((p_srv_fifo_t *)1); 907 } 908 909 *nfifos = inst_taskqs; 910 srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t), 911 KM_SLEEP); 912 913 /* 914 * If the administrator has specified a fifo depth, use 915 * that, else just decide what should be the depth. 916 */ 917 if (ibd_fifo_depth == 0) 918 depth = (IBD_NUM_RWQE / inst_taskqs) + 16; 919 else 920 depth = ibd_fifo_depth; 921 922 for (i = 0; i < inst_taskqs; i++) 923 if (_ddi_srv_fifo_create(&srv_fifos[i], 924 depth, drain_fifo, 925 (caddr_t)private) != DDI_SUCCESS) 926 break; 927 928 if (i < inst_taskqs) 929 goto map_rx_srv_fifos_fail1; 930 931 goto map_rx_srv_fifos_exit; 932 933 map_rx_srv_fifos_fail1: 934 i--; 935 for (; i >= 0; i--) { 936 _ddi_srv_fifo_destroy(srv_fifos[i]); 937 } 938 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 939 srv_fifos = NULL; 940 941 map_rx_srv_fifos_exit: 942 return (srv_fifos); 943 } 944 945 static void 946 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos) 947 { 948 int i; 949 950 /* 951 * If this interface was not using service fifos, quickly return. 952 */ 953 if (inst_taskqs == 0) 954 return; 955 956 for (i = 0; i < inst_taskqs; i++) { 957 _ddi_srv_fifo_destroy(srv_fifos[i]); 958 } 959 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 960 } 961 962 /* 963 * Choose between sending up the packet directly and handing off 964 * to a service thread. 965 */ 966 static void 967 ibd_send_up(ibd_state_t *state, mblk_t *mp) 968 { 969 p_srv_fifo_t *srvfifo; 970 ipoib_hdr_t *lhdr; 971 struct ip *ip_hdr; 972 struct udphdr *tran_hdr; 973 uchar_t prot; 974 int tnum = -1, nfifos = state->id_nfifos; 975 976 /* 977 * Quick path if the interface is not using service fifos. 978 */ 979 if (nfifos == 0) { 980 hand_off: 981 IBD_CKSUM_RECV(mp); 982 mac_rx(state->id_mh, NULL, mp); 983 return; 984 } 985 986 /* 987 * Is the packet big enough to look at the IPoIB header 988 * and basic IP header to determine whether it is an 989 * IPv4 packet? 990 */ 991 if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE + 992 sizeof (struct ip))) { 993 994 lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE); 995 996 /* 997 * Is the packet an IP(v4) packet? 998 */ 999 if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) { 1000 1001 ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE + 1002 IPOIB_HDRSIZE); 1003 prot = ip_hdr->ip_p; 1004 1005 /* 1006 * TCP or UDP packet? We use the UDP header, since 1007 * the first few words of both headers are laid out 1008 * similarly (src/dest ports). 1009 */ 1010 if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) { 1011 1012 tran_hdr = (struct udphdr *)( 1013 (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2)); 1014 1015 /* 1016 * Are we within limits of this packet? If 1017 * so, use the destination port to hash to 1018 * a service thread. 
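				 * (The hash below actually uses the sum of
				 * the source and destination ports, modulo
				 * the number of service fifos.)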
1019 */ 1020 if (mp->b_wptr >= ((uchar_t *)tran_hdr + 1021 sizeof (*tran_hdr))) 1022 tnum = (ntohs(tran_hdr->uh_dport) + 1023 ntohs(tran_hdr->uh_sport)) % 1024 nfifos; 1025 } 1026 } 1027 } 1028 1029 /* 1030 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the 1031 * packet up in interrupt context, reducing latency. 1032 */ 1033 if (tnum == -1) { 1034 goto hand_off; 1035 } 1036 1037 srvfifo = (p_srv_fifo_t *)state->id_fifos; 1038 if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp, 1039 B_TRUE) != DDI_SUCCESS) 1040 freemsg(mp); 1041 } 1042 1043 /* 1044 * Address handle entries maintained by the driver are kept in the 1045 * free and active lists. Each entry starts out in the free list; 1046 * it migrates to the active list when primed using ibt_get_paths() 1047 * and ibt_modify_ud_dest() for transmission to a specific destination. 1048 * In the active list, the entry has a reference count indicating the 1049 * number of ongoing/uncompleted transmits that reference it. The 1050 * entry is left in the active list even after the reference count 1051 * goes to 0, since successive transmits can find it there and do 1052 * not need to set up another entry (ie the path information is 1053 * cached using the active list). Entries on the active list are 1054 * also hashed using the destination link address as a key for faster 1055 * lookups during transmits. 1056 * 1057 * For any destination address (unicast or multicast, whatever the 1058 * join states), there will be at most one entry in the active list. 1059 * Entries with a 0 reference count on the active list can be reused 1060 * for a transmit to a new destination, if the free list is empty. 1061 * 1062 * The AH free list insertion/deletion is protected with the id_ac_mutex, 1063 * since the async thread and Tx callback handlers insert/delete. The 1064 * active list does not need a lock (all operations are done by the 1065 * async thread) but updates to the reference count are atomically 1066 * done (increments done by Tx path, decrements by the Tx callback handler). 1067 */ 1068 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 1069 list_insert_head(&state->id_ah_free, ce) 1070 #define IBD_ACACHE_GET_FREE(state) \ 1071 list_get_head(&state->id_ah_free) 1072 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 1073 int _ret_; \ 1074 list_insert_head(&state->id_ah_active, ce); \ 1075 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 1076 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1077 ASSERT(_ret_ == 0); \ 1078 } 1079 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 1080 list_remove(&state->id_ah_active, ce); \ 1081 (void) mod_hash_remove(state->id_ah_active_hash, \ 1082 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1083 } 1084 #define IBD_ACACHE_GET_ACTIVE(state) \ 1085 list_get_head(&state->id_ah_active) 1086 1087 /* 1088 * Membership states for different mcg's are tracked by two lists: 1089 * the "non" list is used for promiscuous mode, when all mcg traffic 1090 * needs to be inspected. This type of membership is never used for 1091 * transmission, so there can not be an AH in the active list 1092 * corresponding to a member in this list. This list does not need 1093 * any protection, since all operations are performed by the async 1094 * thread. 1095 * 1096 * "Full" and "SendOnly" membership is tracked using a single list, 1097 * the "full" list. 
This is because this single list can then be 1098 * searched during transmit to a multicast group (if an AH for the 1099 * mcg is not found in the active list), since at least one type 1100 * of membership must be present before initiating the transmit. 1101 * This list is also emptied during driver detach, since sendonly 1102 * membership acquired during transmit is dropped at detach time 1103 * alongwith ipv4 broadcast full membership. Insert/deletes to 1104 * this list are done only by the async thread, but it is also 1105 * searched in program context (see multicast disable case), thus 1106 * the id_mc_mutex protects the list. The driver detach path also 1107 * deconstructs the "full" list, but it ensures that the async 1108 * thread will not be accessing the list (by blocking out mcg 1109 * trap handling and making sure no more Tx reaping will happen). 1110 * 1111 * Currently, an IBA attach is done in the SendOnly case too, 1112 * although this is not required. 1113 */ 1114 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1115 list_insert_head(&state->id_mc_full, mce) 1116 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1117 list_insert_head(&state->id_mc_non, mce) 1118 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1119 ibd_mcache_find(mgid, &state->id_mc_full) 1120 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1121 ibd_mcache_find(mgid, &state->id_mc_non) 1122 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1123 list_remove(&state->id_mc_full, mce) 1124 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1125 list_remove(&state->id_mc_non, mce) 1126 1127 /* 1128 * AH and MCE active list manipulation: 1129 * 1130 * Multicast disable requests and MCG delete traps are two cases 1131 * where the active AH entry for the mcg (if any unreferenced one exists) 1132 * will be moved to the free list (to force the next Tx to the mcg to 1133 * join the MCG in SendOnly mode). Port up handling will also move AHs 1134 * from active to free list. 1135 * 1136 * In the case when some transmits are still pending on an entry 1137 * for an mcg, but a multicast disable has already been issued on the 1138 * mcg, there are some options to consider to preserve the join state 1139 * to ensure the emitted packet is properly routed on the IBA fabric. 1140 * For the AH, we can 1141 * 1. take out of active list at multicast disable time. 1142 * 2. take out of active list only when last pending Tx completes. 1143 * For the MCE, we can 1144 * 3. take out of active list at multicast disable time. 1145 * 4. take out of active list only when last pending Tx completes. 1146 * 5. move from active list to stale list at multicast disable time. 1147 * We choose to use 2,4. We use option 4 so that if a multicast enable 1148 * is tried before the pending Tx completes, the enable code finds the 1149 * mce in the active list and just has to make sure it will not be reaped 1150 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1151 * a stale list (#5) that would be checked in the enable code would need 1152 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1153 * after the multicast disable would try to put an AH in the active list, 1154 * and associate the mce it finds in the active list to this new AH, 1155 * whereas the mce is already associated with the previous AH (taken off 1156 * the active list), and will be removed once the pending Tx's complete 1157 * (unless a reference count on mce's is implemented). 
One implication of 1158 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1159 * grab new references on the AH, further delaying the leave. 1160 * 1161 * In the case of mcg delete (or create) trap when the port is sendonly 1162 * joined, the AH and MCE handling is different: the AH and MCE has to be 1163 * immediately taken off the active lists (forcing a join and path lookup 1164 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1165 * to an mcg as it is repeatedly created and deleted and goes thru 1166 * reincarnations). 1167 * 1168 * When a port is already sendonly joined, and a multicast enable is 1169 * attempted, the same mce structure is promoted; this ensures only a 1170 * single mce on the active list tracks the most powerful join state. 1171 * 1172 * In the case of port up event handling, the MCE for sendonly membership 1173 * is freed up, and the ACE is put into the free list as soon as possible 1174 * (depending on whether posted Tx's have completed). For fullmembership 1175 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1176 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1177 * done; else the mce is deconstructed (mc_fullreap case). 1178 * 1179 * MCG creation and deletion trap handling: 1180 * 1181 * These traps are unreliable (meaning sometimes the trap might never 1182 * be delivered to the subscribed nodes) and may arrive out-of-order 1183 * since they use UD transport. An alternative to relying on these 1184 * unreliable traps is to poll for mcg presence every so often, but 1185 * instead of doing that, we try to be as conservative as possible 1186 * while handling the traps, and hope that the traps do arrive at 1187 * the subscribed nodes soon. Note that if a node is fullmember 1188 * joined to an mcg, it can not possibly receive a mcg create/delete 1189 * trap for that mcg (by fullmember definition); if it does, it is 1190 * an old trap from a previous incarnation of the mcg. 1191 * 1192 * Whenever a trap is received, the driver cleans up its sendonly 1193 * membership to the group; we choose to do a sendonly leave even 1194 * on a creation trap to handle the case of a prior deletion of the mcg 1195 * having gone unnoticed. Consider an example scenario: 1196 * T1: MCG M is deleted, and fires off deletion trap D1. 1197 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1198 * T3: Node N tries to transmit to M, joining in sendonly mode. 1199 * T4: MCG M is deleted, and fires off deletion trap D2. 1200 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1201 * If the trap is D2, then a LEAVE is not required, since the mcg 1202 * is already deleted; but if it is D1, a LEAVE is required. A safe 1203 * approach is to always LEAVE, but the SM may be confused if it 1204 * receives a LEAVE without a prior JOIN. 1205 * 1206 * Management of the non-membership to an mcg is similar to the above, 1207 * except that if the interface is in promiscuous mode, it is required 1208 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1209 * if the re-join attempt fails (in which case a warning message needs 1210 * to be printed), it is not clear whether it failed due to the mcg not 1211 * existing, or some fabric/hca issues, due to the delayed nature of 1212 * trap delivery. Querying the SA to establish presence/absence of the 1213 * mcg is also racy at best. 
Thus, the driver just prints a warning 1214 * message when it can not rejoin after receiving a create trap, although 1215 * this might be (on rare occassions) a mis-warning if the create trap is 1216 * received after the mcg was deleted. 1217 */ 1218 1219 /* 1220 * Implementation of atomic "recycle" bits and reference count 1221 * on address handles. This utilizes the fact that max reference 1222 * count on any handle is limited by number of send wqes, thus 1223 * high bits in the ac_ref field can be used as the recycle bits, 1224 * and only the low bits hold the number of pending Tx requests. 1225 * This atomic AH reference counting allows the Tx completion 1226 * handler not to acquire the id_ac_mutex to process every completion, 1227 * thus reducing lock contention problems between completion and 1228 * the Tx path. 1229 */ 1230 #define CYCLEVAL 0x80000 1231 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1232 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1233 #define GET_REF(ace) ((ace)->ac_ref) 1234 #define GET_REF_CYCLE(ace) ( \ 1235 /* \ 1236 * Make sure "cycle" bit is set. \ 1237 */ \ 1238 ASSERT(CYCLE_SET(ace)), \ 1239 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1240 ) 1241 #define INC_REF(ace, num) { \ 1242 atomic_add_32(&(ace)->ac_ref, num); \ 1243 } 1244 #define SET_CYCLE_IF_REF(ace) ( \ 1245 CYCLE_SET(ace) ? B_TRUE : \ 1246 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1247 CYCLEVAL ? \ 1248 /* \ 1249 * Clear the "cycle" bit we just set; \ 1250 * ref count known to be 0 from above. \ 1251 */ \ 1252 CLEAR_REFCYCLE(ace), B_FALSE : \ 1253 /* \ 1254 * We set "cycle" bit; let caller know. \ 1255 */ \ 1256 B_TRUE \ 1257 ) 1258 #define DEC_REF_DO_CYCLE(ace) ( \ 1259 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1260 CYCLEVAL ? \ 1261 /* \ 1262 * Ref count known to be 0 from above. \ 1263 */ \ 1264 B_TRUE : \ 1265 B_FALSE \ 1266 ) 1267 1268 static void * 1269 list_get_head(list_t *list) 1270 { 1271 list_node_t *lhead = list_head(list); 1272 1273 if (lhead != NULL) 1274 list_remove(list, lhead); 1275 return (lhead); 1276 } 1277 1278 /* 1279 * This is always guaranteed to be able to queue the work. 1280 */ 1281 static void 1282 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1283 { 1284 /* Initialize request */ 1285 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1286 ptr->rq_op = op; 1287 1288 /* 1289 * Queue provided slot onto request pool. 1290 */ 1291 mutex_enter(&state->id_acache_req_lock); 1292 list_insert_tail(&state->id_req_list, ptr); 1293 1294 /* Go, fetch, async thread */ 1295 cv_signal(&state->id_acache_req_cv); 1296 mutex_exit(&state->id_acache_req_lock); 1297 } 1298 1299 /* 1300 * Main body of the per interface async thread. 1301 */ 1302 static void 1303 ibd_async_work(ibd_state_t *state) 1304 { 1305 ibd_req_t *ptr; 1306 callb_cpr_t cprinfo; 1307 1308 mutex_enter(&state->id_acache_req_lock); 1309 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1310 callb_generic_cpr, "ibd_async_work"); 1311 for (;;) { 1312 ptr = list_get_head(&state->id_req_list); 1313 if (ptr != NULL) { 1314 mutex_exit(&state->id_acache_req_lock); 1315 1316 /* 1317 * Once we have done the operation, there is no 1318 * guarantee the request slot is going to be valid, 1319 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP). 
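			 * (ASYNC_REAP is one such case: the request is
			 * embedded in the mce itself, so ptr is cleared
			 * below instead of being returned to the request
			 * cache.)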
			 */

			/* Perform the request */
			switch (ptr->rq_op) {
				case ASYNC_GETAH:
					ibd_async_acache(state, &ptr->rq_mac);
					break;
				case ASYNC_REAP:
					ibd_async_reap_group(state,
					    ptr->rq_ptr, ptr->rq_gid,
					    IB_MC_JSTATE_FULL);
					/*
					 * the req buf is contained in the
					 * mce structure, so we do not need
					 * to free it here.
					 */
					ptr = NULL;
					break;
				case ASYNC_LEAVE:
				case ASYNC_JOIN:
					ibd_async_multicast(state,
					    ptr->rq_gid, ptr->rq_op);
					break;
				case ASYNC_PROMON:
					ibd_async_setprom(state);
					break;
				case ASYNC_PROMOFF:
					ibd_async_unsetprom(state);
					break;
				case ASYNC_TRAP:
					ibd_async_trap(state, ptr);
					break;
				case ASYNC_SCHED:
					ibd_async_txsched(state);
					break;
				case ASYNC_LINK:
					ibd_async_link(state, ptr);
					break;
				case ASYNC_EXIT:
					mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
					CALLB_CPR_EXIT(&cprinfo);
#endif /* !__lock_lint */
					return;
			}
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
			/*
			 * Nothing to do: wait till new request arrives.
			 */
#ifndef __lock_lint
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif /* !_lock_lint */
		}
	}
	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (--state->id_trap_inprog == 0)
		cv_signal(&state->id_trap_cv);
	mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
	ulong_t ptraddr = (ulong_t)key;
	uint_t hval;

	/*
	 * If the input address is 4 byte aligned, we can just dereference
	 * it. This is most common, since IP will send in a 4 byte aligned
	 * IP header, which implies the 24 byte IPoIB pseudo header will be
	 * 4 byte aligned too.
1437 */ 1438 if ((ptraddr & 3) == 0) 1439 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1440 1441 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1442 return (hval); 1443 } 1444 1445 static int 1446 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1447 { 1448 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1449 return (0); 1450 else 1451 return (1); 1452 } 1453 1454 /* 1455 * Initialize all the per interface caches and lists; AH cache, 1456 * MCG list etc. 1457 */ 1458 static int 1459 ibd_acache_init(ibd_state_t *state) 1460 { 1461 ibd_ace_t *ce; 1462 int i; 1463 1464 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1465 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1466 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1467 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1468 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1469 offsetof(ibd_ace_t, ac_list)); 1470 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1471 offsetof(ibd_ace_t, ac_list)); 1472 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1473 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1474 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1475 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1476 offsetof(ibd_mce_t, mc_list)); 1477 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1478 offsetof(ibd_mce_t, mc_list)); 1479 list_create(&state->id_req_list, sizeof (ibd_req_t), 1480 offsetof(ibd_req_t, rq_list)); 1481 1482 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1483 IBD_NUM_AH, KM_SLEEP); 1484 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1485 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1486 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1487 ibd_acache_fini(state); 1488 return (DDI_FAILURE); 1489 } else { 1490 CLEAR_REFCYCLE(ce); 1491 ce->ac_mce = NULL; 1492 IBD_ACACHE_INSERT_FREE(state, ce); 1493 } 1494 } 1495 return (DDI_SUCCESS); 1496 } 1497 1498 static void 1499 ibd_acache_fini(ibd_state_t *state) 1500 { 1501 ibd_ace_t *ptr; 1502 1503 mutex_enter(&state->id_ac_mutex); 1504 1505 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1506 ASSERT(GET_REF(ptr) == 0); 1507 (void) ibt_free_ud_dest(ptr->ac_dest); 1508 } 1509 1510 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1511 ASSERT(GET_REF(ptr) == 0); 1512 (void) ibt_free_ud_dest(ptr->ac_dest); 1513 } 1514 1515 list_destroy(&state->id_ah_free); 1516 list_destroy(&state->id_ah_active); 1517 list_destroy(&state->id_mc_full); 1518 list_destroy(&state->id_mc_non); 1519 list_destroy(&state->id_req_list); 1520 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1521 mutex_exit(&state->id_ac_mutex); 1522 mutex_destroy(&state->id_ac_mutex); 1523 mutex_destroy(&state->id_mc_mutex); 1524 mutex_destroy(&state->id_acache_req_lock); 1525 cv_destroy(&state->id_acache_req_cv); 1526 } 1527 1528 /* 1529 * Search AH active hash list for a cached path to input destination. 1530 * If we are "just looking", hold == F. When we are in the Tx path, 1531 * we set hold == T to grab a reference on the AH so that it can not 1532 * be recycled to a new destination while the Tx request is posted. 1533 */ 1534 static ibd_ace_t * 1535 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1536 { 1537 ibd_ace_t *ptr; 1538 1539 ASSERT(mutex_owned(&state->id_ac_mutex)); 1540 1541 /* 1542 * Do hash search. 
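	 * mod_hash_find() returns 0 on a hit; when "hold" is set, "num"
	 * references are taken so the entry can not be recycled while the
	 * Tx request is outstanding.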
1543 */ 1544 if (mod_hash_find(state->id_ah_active_hash, 1545 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1546 if (hold) 1547 INC_REF(ptr, num); 1548 return (ptr); 1549 } 1550 return (NULL); 1551 } 1552 1553 /* 1554 * This is called by the tx side; if an initialized AH is found in 1555 * the active list, it is locked down and can be used; if no entry 1556 * is found, an async request is queued to do path resolution. 1557 */ 1558 static ibd_ace_t * 1559 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1560 { 1561 ibd_ace_t *ptr; 1562 ibd_req_t *req; 1563 1564 /* 1565 * Only attempt to print when we can; in the mdt pattr case, the 1566 * address is not aligned properly. 1567 */ 1568 if (((ulong_t)mac & 3) == 0) 1569 DPRINT(4, 1570 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1571 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1572 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1573 htonl(mac->ipoib_gidsuff[1])); 1574 1575 mutex_enter(&state->id_ac_mutex); 1576 1577 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1578 mutex_exit(&state->id_ac_mutex); 1579 return (ptr); 1580 } 1581 1582 /* 1583 * Implementation of a single outstanding async request; if 1584 * the operation is not started yet, queue a request and move 1585 * to ongoing state. Remember in id_ah_addr for which address 1586 * we are queueing the request, in case we need to flag an error; 1587 * Any further requests, for the same or different address, until 1588 * the operation completes, is sent back to GLDv3 to be retried. 1589 * The async thread will update id_ah_op with an error indication 1590 * or will set it to indicate the next look up can start; either 1591 * way, it will mac_tx_update() so that all blocked requests come 1592 * back here. 1593 */ 1594 *err = EAGAIN; 1595 if (state->id_ah_op == NOTSTARTED) { 1596 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1597 if (req != NULL) { 1598 /* 1599 * We did not even find the entry; queue a request 1600 * for it. 1601 */ 1602 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1603 ibd_queue_work_slot(state, req, ASYNC_GETAH); 1604 state->id_ah_op = ONGOING; 1605 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1606 } 1607 } else if ((state->id_ah_op != ONGOING) && 1608 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1609 /* 1610 * Check the status of the pathrecord lookup request 1611 * we had queued before. 1612 */ 1613 if (state->id_ah_op == ERRORED) { 1614 *err = EFAULT; 1615 state->id_ah_error++; 1616 } else { 1617 /* 1618 * ROUTERED case: We need to send to the 1619 * all-router MCG. If we can find the AH for 1620 * the mcg, the Tx will be attempted. If we 1621 * do not find the AH, we return NORESOURCES 1622 * to retry. 1623 */ 1624 ipoib_mac_t routermac; 1625 1626 (void) ibd_get_allroutergroup(state, mac, &routermac); 1627 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1628 numwqe); 1629 } 1630 state->id_ah_op = NOTSTARTED; 1631 } else if ((state->id_ah_op != ONGOING) && 1632 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1633 /* 1634 * This case can happen when we get a higher band 1635 * packet. The easiest way is to reset the state machine 1636 * to accommodate the higher priority packet. 1637 */ 1638 state->id_ah_op = NOTSTARTED; 1639 } 1640 mutex_exit(&state->id_ac_mutex); 1641 1642 return (ptr); 1643 } 1644 1645 /* 1646 * Grab a not-currently-in-use AH/PathRecord from the active 1647 * list to recycle to a new destination. Only the async thread 1648 * executes this code. 
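 * The scan below starts at the head of the active list and returns the
 * first entry found with a zero reference count (after pulling it off
 * the active list), or NULL if every entry is still referenced.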
1649 */ 1650 static ibd_ace_t * 1651 ibd_acache_get_unref(ibd_state_t *state) 1652 { 1653 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1654 1655 ASSERT(mutex_owned(&state->id_ac_mutex)); 1656 1657 /* 1658 * Do plain linear search. 1659 */ 1660 while (ptr != NULL) { 1661 /* 1662 * Note that it is possible that the "cycle" bit 1663 * is set on the AH w/o any reference count. The 1664 * mcg must have been deleted, and the tx cleanup 1665 * just decremented the reference count to 0, but 1666 * hasn't gotten around to grabbing the id_ac_mutex 1667 * to move the AH into the free list. 1668 */ 1669 if (GET_REF(ptr) == 0) { 1670 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1671 break; 1672 } 1673 ptr = list_next(&state->id_ah_active, ptr); 1674 } 1675 return (ptr); 1676 } 1677 1678 /* 1679 * Invoked to clean up AH from active list in case of multicast 1680 * disable and to handle sendonly memberships during mcg traps. 1681 * And for port up processing for multicast and unicast AHs. 1682 * Normally, the AH is taken off the active list, and put into 1683 * the free list to be recycled for a new destination. In case 1684 * Tx requests on the AH have not completed yet, the AH is marked 1685 * for reaping (which will put the AH on the free list) once the Tx's 1686 * complete; in this case, depending on the "force" input, we take 1687 * out the AH from the active list right now, or leave it also for 1688 * the reap operation. Returns TRUE if the AH is taken off the active 1689 * list (and either put into the free list right now, or arranged for 1690 * later), FALSE otherwise. 1691 */ 1692 static boolean_t 1693 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1694 { 1695 ibd_ace_t *acactive; 1696 boolean_t ret = B_TRUE; 1697 1698 ASSERT(mutex_owned(&state->id_ac_mutex)); 1699 1700 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1701 1702 /* 1703 * Note that the AH might already have the cycle bit set 1704 * on it; this might happen if sequences of multicast 1705 * enables and disables are coming so fast, that posted 1706 * Tx's to the mcg have not completed yet, and the cycle 1707 * bit is set successively by each multicast disable. 1708 */ 1709 if (SET_CYCLE_IF_REF(acactive)) { 1710 if (!force) { 1711 /* 1712 * The ace is kept on the active list, further 1713 * Tx's can still grab a reference on it; the 1714 * ace is reaped when all pending Tx's 1715 * referencing the AH complete. 1716 */ 1717 ret = B_FALSE; 1718 } else { 1719 /* 1720 * In the mcg trap case, we always pull the 1721 * AH from the active list. And also the port 1722 * up multi/unicast case. 1723 */ 1724 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1725 acactive->ac_mce = NULL; 1726 } 1727 } else { 1728 /* 1729 * Determined the ref count is 0, thus reclaim 1730 * immediately after pulling out the ace from 1731 * the active list. 1732 */ 1733 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1734 acactive->ac_mce = NULL; 1735 IBD_ACACHE_INSERT_FREE(state, acactive); 1736 } 1737 1738 } 1739 return (ret); 1740 } 1741 1742 /* 1743 * Helper function for async path record lookup. If we are trying to 1744 * Tx to a MCG, check our membership, possibly trying to join the 1745 * group if required. If that fails, try to send the packet to the 1746 * all router group (indicated by the redirect output), pointing 1747 * the input mac address to the router mcg address. 
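 * On such a redirect, *redirect is set to B_TRUE and the caller's mac
 * is overwritten with the all-router group address.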
1748 */ 1749 static ibd_mce_t * 1750 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1751 { 1752 ib_gid_t mgid; 1753 ibd_mce_t *mce; 1754 ipoib_mac_t routermac; 1755 1756 *redirect = B_FALSE; 1757 ibd_n2h_gid(mac, &mgid); 1758 1759 /* 1760 * Check the FullMember+SendOnlyNonMember list. 1761 * Since we are the only one who manipulates the 1762 * id_mc_full list, no locks are needed. 1763 */ 1764 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1765 if (mce != NULL) { 1766 DPRINT(4, "ibd_async_mcache : already joined to group"); 1767 return (mce); 1768 } 1769 1770 /* 1771 * Not found; try to join(SendOnlyNonMember) and attach. 1772 */ 1773 DPRINT(4, "ibd_async_mcache : not joined to group"); 1774 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1775 NULL) { 1776 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1777 return (mce); 1778 } 1779 1780 /* 1781 * MCGroup not present; try to join the all-router group. If 1782 * any of the following steps succeed, we will be redirecting 1783 * to the all router group. 1784 */ 1785 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1786 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1787 return (NULL); 1788 *redirect = B_TRUE; 1789 ibd_n2h_gid(&routermac, &mgid); 1790 bcopy(&routermac, mac, IPOIB_ADDRL); 1791 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1792 mgid.gid_prefix, mgid.gid_guid); 1793 1794 /* 1795 * Are we already joined to the router group? 1796 */ 1797 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1798 DPRINT(4, "ibd_async_mcache : using already joined router" 1799 "group\n"); 1800 return (mce); 1801 } 1802 1803 /* 1804 * Can we join(SendOnlyNonMember) the router group? 1805 */ 1806 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1807 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1808 NULL) { 1809 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1810 return (mce); 1811 } 1812 1813 return (NULL); 1814 } 1815 1816 /* 1817 * Async path record lookup code. 1818 */ 1819 static void 1820 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1821 { 1822 ibd_ace_t *ce; 1823 ibd_mce_t *mce = NULL; 1824 ibt_path_attr_t path_attr; 1825 ibt_path_info_t path_info; 1826 ib_gid_t destgid; 1827 int ret = NOTSTARTED; 1828 1829 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1830 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1831 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1832 htonl(mac->ipoib_gidsuff[1])); 1833 1834 /* 1835 * Check whether we are trying to transmit to a MCG. 1836 * In that case, we need to make sure we are a member of 1837 * the MCG. 1838 */ 1839 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1840 boolean_t redirected; 1841 1842 /* 1843 * If we can not find or join the group or even 1844 * redirect, error out. 1845 */ 1846 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1847 NULL) { 1848 state->id_ah_op = ERRORED; 1849 return; 1850 } 1851 1852 /* 1853 * If we got redirected, we need to determine whether 1854 * the AH for the new mcg is in the cache already, and 1855 * not pull it in then; otherwise proceed to get the 1856 * path for the new mcg. There is no guarantee that 1857 * if the AH is currently in the cache, it will still be 1858 * there when we look in ibd_acache_lookup(), but that's 1859 * okay, we will come back here. 
1860 */ 1861 if (redirected) { 1862 ret = ROUTERED; 1863 DPRINT(4, "ibd_async_acache : redirected to " 1864 "%08X:%08X:%08X:%08X:%08X", 1865 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1866 htonl(mac->ipoib_gidpref[1]), 1867 htonl(mac->ipoib_gidsuff[0]), 1868 htonl(mac->ipoib_gidsuff[1])); 1869 1870 mutex_enter(&state->id_ac_mutex); 1871 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1872 mutex_exit(&state->id_ac_mutex); 1873 DPRINT(4, "ibd_async_acache : router AH found"); 1874 state->id_ah_op = ROUTERED; 1875 return; 1876 } 1877 mutex_exit(&state->id_ac_mutex); 1878 } 1879 } 1880 1881 /* 1882 * Get an AH from the free list. 1883 */ 1884 mutex_enter(&state->id_ac_mutex); 1885 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1886 /* 1887 * No free ones; try to grab an unreferenced active 1888 * one. Maybe we need to make the active list LRU, 1889 * but that will create more work for Tx callbacks. 1890 * Is there a way of not having to pull out the 1891 * entry from the active list, but just indicate it 1892 * is being recycled? Yes, but that creates one more 1893 * check in the fast lookup path. 1894 */ 1895 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1896 /* 1897 * Pretty serious shortage now. 1898 */ 1899 state->id_ah_op = NOTSTARTED; 1900 mutex_exit(&state->id_ac_mutex); 1901 DPRINT(10, "ibd_async_acache : failed to find AH " 1902 "slot\n"); 1903 return; 1904 } 1905 /* 1906 * We could check whether ac_mce points to a SendOnly 1907 * member and drop that membership now. Or do it lazily 1908 * at detach time. 1909 */ 1910 ce->ac_mce = NULL; 1911 } 1912 mutex_exit(&state->id_ac_mutex); 1913 ASSERT(ce->ac_mce == NULL); 1914 1915 /* 1916 * Update the entry. 1917 */ 1918 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1919 1920 bzero(&path_info, sizeof (path_info)); 1921 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1922 path_attr.pa_sgid = state->id_sgid; 1923 path_attr.pa_num_dgids = 1; 1924 ibd_n2h_gid(&ce->ac_mac, &destgid); 1925 path_attr.pa_dgids = &destgid; 1926 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1927 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1928 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1929 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1930 goto error; 1931 } 1932 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1933 ntohl(ce->ac_mac.ipoib_qpn), 1934 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1935 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1936 goto error; 1937 } 1938 1939 /* 1940 * mce is set whenever an AH is being associated with a 1941 * MCG; this will come in handy when we leave the MCG. The 1942 * lock protects Tx fastpath from scanning the active list. 1943 */ 1944 if (mce != NULL) 1945 ce->ac_mce = mce; 1946 mutex_enter(&state->id_ac_mutex); 1947 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1948 state->id_ah_op = ret; 1949 mutex_exit(&state->id_ac_mutex); 1950 return; 1951 error: 1952 /* 1953 * We might want to drop SendOnly membership here if we 1954 * joined above. The lock protects Tx callbacks inserting 1955 * into the free list. 1956 */ 1957 mutex_enter(&state->id_ac_mutex); 1958 state->id_ah_op = ERRORED; 1959 IBD_ACACHE_INSERT_FREE(state, ce); 1960 mutex_exit(&state->id_ac_mutex); 1961 } 1962 1963 /* 1964 * While restoring port's presence on the subnet on a port up, it is possible 1965 * that the port goes down again. 
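 * For the IBD_LINK_UP_ABSENT case, the work below amounts to: redo the promiscuous nonmemberships if promiscuous mode had completed, drop sendonly memberships and reacquire full ones, recycle every active AH to the free list, and finally report the resulting link state to GLDv3 via mac_link_update().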
1966 */ 1967 static void 1968 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1969 { 1970 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1971 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1972 LINK_STATE_UP; 1973 ibd_mce_t *mce, *pmce; 1974 ibd_ace_t *ace, *pace; 1975 1976 DPRINT(10, "ibd_async_link(): %d", opcode); 1977 1978 /* 1979 * On a link up, revalidate the link speed/width. No point doing 1980 * this on a link down, since we will be unable to do SA operations, 1981 * defaulting to the lowest speed. Also notice that we update our 1982 * notion of speed before calling mac_link_update(), which will do 1983 * necessary higher level notifications for speed changes. 1984 */ 1985 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1986 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1987 state->id_link_speed = ibd_get_portspeed(state); 1988 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1989 } 1990 1991 /* 1992 * Do all the work required to establish our presence on 1993 * the subnet. 1994 */ 1995 if (opcode == IBD_LINK_UP_ABSENT) { 1996 /* 1997 * If in promiscuous mode ... 1998 */ 1999 if (state->id_prom_op == COMPLETED) { 2000 /* 2001 * Drop all nonmembership. 2002 */ 2003 ibd_async_unsetprom(state); 2004 2005 /* 2006 * Then, try to regain nonmembership to all mcg's. 2007 */ 2008 ibd_async_setprom(state); 2009 2010 } 2011 2012 /* 2013 * Drop all sendonly membership (which also gets rid of the 2014 * AHs); try to reacquire all full membership. 2015 */ 2016 mce = list_head(&state->id_mc_full); 2017 while ((pmce = mce) != NULL) { 2018 mce = list_next(&state->id_mc_full, mce); 2019 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 2020 ibd_leave_group(state, 2021 pmce->mc_info.mc_adds_vect.av_dgid, 2022 IB_MC_JSTATE_SEND_ONLY_NON); 2023 else 2024 ibd_reacquire_group(state, pmce); 2025 } 2026 2027 /* 2028 * Recycle all active AHs to free list (and if there are 2029 * pending posts, make sure they will go into the free list 2030 * once the Tx's complete). Grab the lock to prevent 2031 * concurrent Tx's as well as Tx cleanups. 2032 */ 2033 mutex_enter(&state->id_ac_mutex); 2034 ace = list_head(&state->id_ah_active); 2035 while ((pace = ace) != NULL) { 2036 boolean_t cycled; 2037 2038 ace = list_next(&state->id_ah_active, ace); 2039 mce = pace->ac_mce; 2040 cycled = ibd_acache_recycle(state, &pace->ac_mac, 2041 B_TRUE); 2042 /* 2043 * If this is for an mcg, it must be for a fullmember, 2044 * since we got rid of send-only members above when 2045 * processing the mce list. 2046 */ 2047 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2048 IB_MC_JSTATE_FULL))); 2049 2050 /* 2051 * Check if the fullmember mce needs to be torn down, 2052 * ie whether the DLPI disable has already been done. 2053 * If so, do some of the work of tx_cleanup, namely 2054 * causing leave (which will fail), detach and 2055 * mce-freeing. tx_cleanup will put the AH into free 2056 * list. The reason to duplicate some of this 2057 * tx_cleanup work is because we want to delete the 2058 * AH right now instead of waiting for tx_cleanup, to 2059 * force subsequent Tx's to reacquire an AH. 2060 */ 2061 if ((mce != NULL) && (mce->mc_fullreap)) 2062 ibd_async_reap_group(state, mce, 2063 mce->mc_info.mc_adds_vect.av_dgid, 2064 mce->mc_jstate); 2065 } 2066 mutex_exit(&state->id_ac_mutex); 2067 } 2068 2069 /* 2070 * mac handle is guaranteed to exist since driver does ibt_close_hca() 2071 * (which stops further events from being delivered) before 2072 * mac_unregister().
At this point, it is guaranteed that mac_register 2073 * has already been done. 2074 */ 2075 mutex_enter(&state->id_link_mutex); 2076 state->id_link_state = lstate; 2077 mac_link_update(state->id_mh, lstate); 2078 mutex_exit(&state->id_link_mutex); 2079 2080 ibd_async_done(state); 2081 } 2082 2083 /* 2084 * When the link is notified up, we need to do a few things, based 2085 * on the port's current p_init_type_reply claiming a reinit has been 2086 * done or not. The reinit steps are: 2087 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2088 * the old Pkey and GID0 are correct. 2089 * 2. Register for mcg traps (already done by ibmf). 2090 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2091 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2092 * 4. Give up all sendonly memberships. 2093 * 5. Acquire all full memberships. 2094 * 6. In promiscuous mode, acquire all non memberships. 2095 * 7. Recycle all AHs to free list. 2096 */ 2097 static void 2098 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2099 { 2100 ibt_hca_portinfo_t *port_infop; 2101 ibt_status_t ibt_status; 2102 uint_t psize, port_infosz; 2103 ibd_link_op_t opcode; 2104 ibd_req_t *req; 2105 2106 /* 2107 * Do not send a request to the async daemon if it has not 2108 * yet been created or is being destroyed. If the async 2109 * daemon has not yet been created, we still need to track 2110 * last known state of the link. If this code races with the 2111 * detach path, then we are assured that the detach path has 2112 * not yet done the ibt_close_hca (which waits for all async 2113 * events to complete). If the code races with the attach path, 2114 * we need to validate the pkey/gid (in the link_up case) if 2115 * the initialization path has already set these up and created 2116 * IBTF resources based on the values. 2117 */ 2118 mutex_enter(&state->id_link_mutex); 2119 2120 /* 2121 * If the init code in ibd_drv_init hasn't yet set up the 2122 * pkey/gid, nothing to do; that code will set the link state. 2123 */ 2124 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2125 mutex_exit(&state->id_link_mutex); 2126 return; 2127 } 2128 2129 if (code == IBT_EVENT_PORT_UP) { 2130 uint8_t itreply; 2131 boolean_t badup = B_FALSE; 2132 2133 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 2134 state->id_port, &port_infop, &psize, &port_infosz); 2135 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 2136 mutex_exit(&state->id_link_mutex); 2137 DPRINT(10, "ibd_link_up : failed in" 2138 " ibt_query_port()\n"); 2139 return; 2140 } 2141 2142 /* 2143 * If the link already went down by the time the handler gets 2144 * here, give up; we can not even validate pkey/gid since those 2145 * are not valid. 2146 */ 2147 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) 2148 badup = B_TRUE; 2149 2150 itreply = port_infop->p_init_type_reply; 2151 2152 /* 2153 * In InitTypeReply, check if NoLoadReply == 2154 * PreserveContentReply == 0, in which case, verify Pkey/GID0. 2155 */ 2156 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2157 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && 2158 (!badup)) { 2159 /* 2160 * Check that the subnet part of GID0 has not changed. 2161 */ 2162 if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, 2163 sizeof (ib_gid_t)) != 0) 2164 badup = B_TRUE; 2165 2166 /* 2167 * Check that Pkey/index mapping is still valid. 
2168 */ 2169 if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || 2170 (port_infop->p_pkey_tbl[state->id_pkix] != 2171 state->id_pkey)) 2172 badup = B_TRUE; 2173 } 2174 2175 /* 2176 * In InitTypeReply, if PreservePresenceReply indicates the SM 2177 * has ensured that the port's presence in mcg, traps etc is 2178 * intact, nothing more to do. 2179 */ 2180 opcode = IBD_LINK_UP_ABSENT; 2181 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2182 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) 2183 opcode = IBD_LINK_UP; 2184 2185 if (badup) 2186 code = IBT_ERROR_PORT_DOWN; 2187 ibt_free_portinfo(port_infop, port_infosz); 2188 } 2189 2190 if (!ibd_async_safe(state)) { 2191 state->id_link_state = ((code == IBT_EVENT_PORT_UP) ? 2192 LINK_STATE_UP : LINK_STATE_DOWN); 2193 mutex_exit(&state->id_link_mutex); 2194 return; 2195 } 2196 mutex_exit(&state->id_link_mutex); 2197 2198 if (code == IBT_ERROR_PORT_DOWN) 2199 opcode = IBD_LINK_DOWN; 2200 2201 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2202 req->rq_ptr = (void *)opcode; 2203 ibd_queue_work_slot(state, req, ASYNC_LINK); 2204 } 2205 2206 /* 2207 * For the port up/down events, IBTL guarantees there will not be concurrent 2208 * invocations of the handler. IBTL might coalesce link transition events, 2209 * and not invoke the handler for _each_ up/down transition, but it will 2210 * invoke the handler with last known state 2211 */ 2212 static void 2213 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2214 ibt_async_code_t code, ibt_async_event_t *event) 2215 { 2216 ibd_state_t *state = (ibd_state_t *)clnt_private; 2217 2218 switch (code) { 2219 case IBT_ERROR_CATASTROPHIC_CHAN: 2220 ibd_print_warn(state, "catastrophic channel error"); 2221 break; 2222 case IBT_ERROR_CQ: 2223 ibd_print_warn(state, "completion queue error"); 2224 break; 2225 case IBT_ERROR_PORT_DOWN: 2226 case IBT_EVENT_PORT_UP: 2227 /* 2228 * Events will be delivered to all instances that have 2229 * done ibt_open_hca() but not yet done ibt_close_hca(). 2230 * Only need to do work for our port; IBTF will deliver 2231 * events for other ports on the hca we have ibt_open_hca'ed 2232 * too. Note that ibd_drv_init() initializes id_port before 2233 * doing ibt_open_hca(). 2234 */ 2235 ASSERT(state->id_hca_hdl == hca_hdl); 2236 if (state->id_port != event->ev_port) 2237 break; 2238 2239 ibd_link_mod(state, code); 2240 break; 2241 2242 case IBT_HCA_ATTACH_EVENT: 2243 case IBT_HCA_DETACH_EVENT: 2244 /* 2245 * When a new card is plugged to the system, attach_event is 2246 * invoked. Additionally, a cfgadm needs to be run to make the 2247 * card known to the system, and an ifconfig needs to be run to 2248 * plumb up any ibd interfaces on the card. In the case of card 2249 * unplug, a cfgadm is run that will trigger any RCM scripts to 2250 * unplumb the ibd interfaces on the card; when the card is 2251 * actually unplugged, the detach_event is invoked; 2252 * additionally, if any ibd instances are still active on the 2253 * card (eg there were no associated RCM scripts), driver's 2254 * detach routine is invoked. 2255 */ 2256 break; 2257 default: 2258 break; 2259 } 2260 } 2261 2262 /* 2263 * Attach device to the IO framework. 
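 * The sequence below is: allocate and initialize the soft state, add the Rx/Tx soft interrupts, ibt_attach() to IBTL, ibd_drv_init(), register with GLDv3 via mac_register(), arm the receive CQ handler, register for subnet notices, and finally report the last known link state.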
2264 */ 2265 static int 2266 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2267 { 2268 mac_register_t *macp; 2269 ibd_state_t *state; 2270 int instance; 2271 int err; 2272 2273 switch (cmd) { 2274 case DDI_ATTACH: 2275 break; 2276 case DDI_RESUME: 2277 /* This driver does not support resume */ 2278 default: 2279 return (DDI_FAILURE); 2280 } 2281 2282 /* 2283 * Allocate soft device data structure 2284 */ 2285 instance = ddi_get_instance(dip); 2286 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2287 return (DDI_FAILURE); 2288 state = ddi_get_soft_state(ibd_list, instance); 2289 2290 /* pre ibt_attach() soft state initialization */ 2291 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2292 DPRINT(10, "ibd_attach : failed in ibd_state_init()"); 2293 goto attach_fail_state_init; 2294 } 2295 2296 /* alloc rx soft intr */ 2297 if ((ibd_rx_softintr == 1) && 2298 ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2299 NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) { 2300 DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); 2301 goto attach_fail_ddi_add_rx_softintr; 2302 } 2303 2304 /* alloc tx soft intr */ 2305 if ((ibd_tx_softintr == 1) && 2306 ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2307 NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) { 2308 DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); 2309 goto attach_fail_ddi_add_tx_softintr; 2310 } 2311 2312 /* "attach" to IBTL */ 2313 if (ibt_attach(&ibd_clnt_modinfo, dip, state, 2314 &state->id_ibt_hdl) != IBT_SUCCESS) { 2315 DPRINT(10, "ibd_attach : failed in ibt_attach()"); 2316 goto attach_fail_ibt_attach; 2317 } 2318 2319 /* Finish initializing this driver */ 2320 if (ibd_drv_init(state) != DDI_SUCCESS) { 2321 DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); 2322 goto attach_fail_drv_init; 2323 } 2324 2325 /* 2326 * Initialize pointers to device specific functions which will be 2327 * used by the generic layer. 2328 */ 2329 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2330 DPRINT(10, "ibd_attach : failed in mac_alloc()"); 2331 goto attach_fail_drv_init; 2332 } 2333 2334 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2335 macp->m_driver = state; 2336 macp->m_dip = state->id_dip; 2337 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2338 macp->m_callbacks = &ib_m_callbacks; 2339 macp->m_min_sdu = 0; 2340 macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE; 2341 2342 /* 2343 * Register ourselves with the GLDv3 interface 2344 */ 2345 err = mac_register(macp, &state->id_mh); 2346 mac_free(macp); 2347 if (err != 0) { 2348 DPRINT(10, "ibd_attach : failed in mac_register()"); 2349 goto attach_fail_mac_register; 2350 } 2351 2352 /* 2353 * Setup the handler we will use for regular DLPI stuff. It's important 2354 * to set up the recv handler after registering with gldv3. 2355 */ 2356 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2357 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 2358 IBT_SUCCESS) { 2359 DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); 2360 goto attach_fail_setup_handler; 2361 } 2362 2363 /* 2364 * Setup the subnet notices handler after we initialize the a/mcaches 2365 * and start the async thread, both of which are required for the 2366 * trap handler to function properly. Enable the trap handler to 2367 * queue requests to the async thread after the mac_register, because 2368 * the async daemon invokes mac_tx_update(), which must be done after 2369 * mac_register().
2370 */ 2371 ibt_register_subnet_notices(state->id_ibt_hdl, 2372 ibd_snet_notices_handler, state); 2373 mutex_enter(&state->id_trap_lock); 2374 state->id_trap_stop = B_FALSE; 2375 mutex_exit(&state->id_trap_lock); 2376 2377 /* 2378 * Indicate link status to GLDv3 and higher layers. By default, 2379 * we assume we are in up state (which must have been true at 2380 * least at the time the broadcast mcg's were probed); if there 2381 * were any up/down transitions till the time we come here, the 2382 * async handler will have updated last known state, which we 2383 * use to tell GLDv3. The async handler will not send any 2384 * notifications to GLDv3 till we reach here in the initialization 2385 * sequence. 2386 */ 2387 mac_link_update(state->id_mh, state->id_link_state); 2388 2389 return (DDI_SUCCESS); 2390 2391 /* Attach failure points, cleanup */ 2392 attach_fail_setup_handler: 2393 (void) mac_unregister(state->id_mh); 2394 2395 attach_fail_mac_register: 2396 ibd_drv_fini(state); 2397 2398 attach_fail_drv_init: 2399 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2400 ibd_print_warn(state, "failed to free IB resources"); 2401 2402 attach_fail_ibt_attach: 2403 if (ibd_tx_softintr == 1) 2404 ddi_remove_softintr(state->id_tx); 2405 2406 attach_fail_ddi_add_tx_softintr: 2407 if (ibd_rx_softintr == 1) 2408 ddi_remove_softintr(state->id_rx); 2409 2410 attach_fail_ddi_add_rx_softintr: 2411 ibd_state_fini(state); 2412 2413 attach_fail_state_init: 2414 ddi_soft_state_free(ibd_list, instance); 2415 2416 return (DDI_FAILURE); 2417 } 2418 2419 /* 2420 * Detach device from the IO framework. 2421 */ 2422 static int 2423 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2424 { 2425 ibd_state_t *state; 2426 int status; 2427 int instance; 2428 2429 switch (cmd) { 2430 case DDI_DETACH: 2431 break; 2432 case DDI_SUSPEND: 2433 default: 2434 return (DDI_FAILURE); 2435 } 2436 2437 instance = ddi_get_instance(dip); 2438 state = ddi_get_soft_state(ibd_list, instance); 2439 2440 /* 2441 * First, stop receive interrupts; this stops the 2442 * driver from handing up buffers to higher layers. 2443 * Wait for receive buffers to be returned; give up 2444 * after 5 seconds. 2445 */ 2446 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 2447 status = 50; 2448 while (state->id_rx_list.dl_bufs_outstanding > 0) { 2449 delay(drv_usectohz(100000)); 2450 if (--status == 0) { 2451 DPRINT(2, "ibd_detach : reclaiming failed"); 2452 goto failed; 2453 } 2454 } 2455 2456 if (mac_unregister(state->id_mh) != DDI_SUCCESS) { 2457 DPRINT(10, "ibd_detach : failed in mac_unregister()"); 2458 goto failed; 2459 } 2460 2461 if (ibd_rx_softintr == 1) 2462 ddi_remove_softintr(state->id_rx); 2463 2464 if (ibd_tx_softintr == 1) 2465 ddi_remove_softintr(state->id_tx); 2466 2467 ibd_drv_fini(state); 2468 2469 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2470 ibd_print_warn(state, "failed to free all IB resources at " 2471 "driver detach time"); 2472 2473 ibd_state_fini(state); 2474 ddi_soft_state_free(ibd_list, instance); 2475 return (DDI_SUCCESS); 2476 2477 failed: 2478 /* 2479 * Reap all the Tx/Rx completions that were posted since we 2480 * turned off the notification. Turn on notifications. There 2481 * is a race in that we do not reap completions that come in 2482 * after the poll and before notifications get turned on. That 2483 * is okay, the next rx/tx packet will trigger a completion 2484 * that will reap any missed completions. 
2485 */ 2486 ibd_poll_compq(state, state->id_rcq_hdl); 2487 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2488 return (DDI_FAILURE); 2489 } 2490 2491 /* 2492 * Pre ibt_attach() driver initialization 2493 */ 2494 static int 2495 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2496 { 2497 char buf[64]; 2498 2499 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2500 state->id_link_state = LINK_STATE_UNKNOWN; 2501 2502 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2503 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2504 state->id_trap_stop = B_TRUE; 2505 state->id_trap_inprog = 0; 2506 2507 mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL); 2508 state->id_dip = dip; 2509 2510 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2511 2512 state->id_tx_list.dl_head = NULL; 2513 state->id_tx_list.dl_tail = NULL; 2514 state->id_tx_list.dl_pending_sends = B_FALSE; 2515 state->id_tx_list.dl_cnt = 0; 2516 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2517 2518 state->id_rx_list.dl_head = NULL; 2519 state->id_rx_list.dl_tail = NULL; 2520 state->id_rx_list.dl_bufs_outstanding = 0; 2521 state->id_rx_list.dl_cnt = 0; 2522 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2523 mutex_init(&state->id_rx_mutex, NULL, MUTEX_DRIVER, NULL); 2524 2525 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2526 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2527 0, NULL, NULL, NULL, NULL, NULL, 0); 2528 2529 return (DDI_SUCCESS); 2530 } 2531 2532 /* 2533 * Post ibt_detach() driver deconstruction 2534 */ 2535 static void 2536 ibd_state_fini(ibd_state_t *state) 2537 { 2538 mutex_destroy(&state->id_tx_list.dl_mutex); 2539 mutex_destroy(&state->id_rx_list.dl_mutex); 2540 mutex_destroy(&state->id_rx_mutex); 2541 mutex_destroy(&state->id_sched_lock); 2542 mutex_destroy(&state->id_txcomp_lock); 2543 2544 cv_destroy(&state->id_trap_cv); 2545 mutex_destroy(&state->id_trap_lock); 2546 mutex_destroy(&state->id_link_mutex); 2547 kmem_cache_destroy(state->id_req_kmc); 2548 } 2549 2550 /* 2551 * Fetch IBA parameters for the network device from IB nexus. 2552 */ 2553 static int 2554 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) 2555 { 2556 /* 2557 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. 2558 * Note that the default partition is also allowed. 2559 */ 2560 state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2561 0, "port-pkey", IB_PKEY_INVALID_LIMITED); 2562 if (state->id_pkey <= IB_PKEY_INVALID_FULL) { 2563 DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" 2564 "partition\n"); 2565 return (DDI_FAILURE); 2566 } 2567 2568 /* 2569 * ... the IBA port ... 2570 */ 2571 state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2572 0, "port-number", 0); 2573 if (state->id_port == 0) { 2574 DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); 2575 return (DDI_FAILURE); 2576 } 2577 2578 /* 2579 * ... and HCA GUID. 2580 */ 2581 *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 2582 0, "hca-guid", 0); 2583 if (*hca_guid == 0) { 2584 DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " 2585 "guid\n"); 2586 return (DDI_FAILURE); 2587 } 2588 2589 return (DDI_SUCCESS); 2590 } 2591 2592 /* 2593 * Fetch link speed from SA for snmp ifspeed reporting. 
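 * The value reported is the 1X SDR data rate of 2 Gb/s scaled by the width/rate multiplier derived from the loopback path record below; e.g. a 4X DDR link (IBT_SRATE_20) is reported as 8 * 2000000000 = 16 Gb/s.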
2594 */ 2595 static uint64_t 2596 ibd_get_portspeed(ibd_state_t *state) 2597 { 2598 int ret; 2599 ibt_path_info_t path; 2600 ibt_path_attr_t path_attr; 2601 uint8_t num_paths; 2602 uint64_t ifspeed; 2603 2604 /* 2605 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2606 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2607 * 2000000000. Start with that as default. 2608 */ 2609 ifspeed = 2000000000; 2610 2611 bzero(&path_attr, sizeof (path_attr)); 2612 2613 /* 2614 * Get the port speed from Loopback path information. 2615 */ 2616 path_attr.pa_dgids = &state->id_sgid; 2617 path_attr.pa_num_dgids = 1; 2618 path_attr.pa_sgid = state->id_sgid; 2619 2620 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2621 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2622 goto earlydone; 2623 2624 if (num_paths < 1) 2625 goto earlydone; 2626 2627 /* 2628 * In case SA does not return an expected value, report the default 2629 * speed as 1X. 2630 */ 2631 ret = 1; 2632 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2633 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2634 ret = 1; 2635 break; 2636 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2637 ret = 4; 2638 break; 2639 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2640 ret = 12; 2641 break; 2642 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2643 ret = 2; 2644 break; 2645 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2646 ret = 8; 2647 break; 2648 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2649 ret = 16; 2650 break; 2651 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2652 ret = 24; 2653 break; 2654 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2655 ret = 32; 2656 break; 2657 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2658 ret = 48; 2659 break; 2660 } 2661 2662 ifspeed *= ret; 2663 2664 earlydone: 2665 return (ifspeed); 2666 } 2667 2668 /* 2669 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2670 * representing the input mcg mgid. 2671 */ 2672 static ibd_mce_t * 2673 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2674 { 2675 ibd_mce_t *ptr = list_head(mlist); 2676 2677 /* 2678 * Do plain linear search. 2679 */ 2680 while (ptr != NULL) { 2681 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2682 sizeof (ib_gid_t)) == 0) 2683 return (ptr); 2684 ptr = list_next(mlist, ptr); 2685 } 2686 return (NULL); 2687 } 2688 2689 /* 2690 * Execute IBA JOIN. 2691 */ 2692 static ibt_status_t 2693 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2694 { 2695 ibt_mcg_attr_t mcg_attr; 2696 2697 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2698 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2699 mcg_attr.mc_mgid = mgid; 2700 mcg_attr.mc_join_state = mce->mc_jstate; 2701 mcg_attr.mc_scope = state->id_scope; 2702 mcg_attr.mc_pkey = state->id_pkey; 2703 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2704 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2705 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2706 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2707 NULL, NULL)); 2708 } 2709 2710 /* 2711 * This code JOINs the port in the proper way (depending on the join 2712 * state) so that IBA fabric will forward mcg packets to/from the port. 2713 * It also attaches the QPN to the mcg so it can receive those mcg 2714 * packets. 
This code makes sure not to attach the mcg to the QP if 2715 * that has been previously done due to the mcg being joined with a 2716 * different join state, even though this is not required by SWG_0216, 2717 * refid 3610. 2718 */ 2719 static ibd_mce_t * 2720 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2721 { 2722 ibt_status_t ibt_status; 2723 ibd_mce_t *mce, *tmce, *omce = NULL; 2724 boolean_t do_attach = B_TRUE; 2725 2726 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2727 jstate, mgid.gid_prefix, mgid.gid_guid); 2728 2729 /* 2730 * For enable_multicast Full member joins, we need to do some 2731 * extra work. If there is already an mce on the list that 2732 * indicates full membership, that means the membership has 2733 * not yet been dropped (since the disable_multicast was issued) 2734 * because there are pending Tx's to the mcg; in that case, just 2735 * mark the mce not to be reaped when the Tx completion queues 2736 * an async reap operation. 2737 * 2738 * If there is already an mce on the list indicating sendonly 2739 * membership, try to promote to full membership. Be careful 2740 * not to deallocate the old mce, since there might be an AH 2741 * pointing to it; instead, update the old mce with new data 2742 * that tracks the full membership. 2743 */ 2744 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2745 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2746 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2747 ASSERT(omce->mc_fullreap); 2748 omce->mc_fullreap = B_FALSE; 2749 return (omce); 2750 } else { 2751 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2752 } 2753 } 2754 2755 /* 2756 * Allocate the ibd_mce_t to track this JOIN. 2757 */ 2758 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2759 mce->mc_fullreap = B_FALSE; 2760 mce->mc_jstate = jstate; 2761 2762 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2763 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2764 ibt_status); 2765 kmem_free(mce, sizeof (ibd_mce_t)); 2766 return (NULL); 2767 } 2768 2769 /* 2770 * Is an IBA attach required? Not if the interface is already joined 2771 * to the mcg in a different appropriate join state. 2772 */ 2773 if (jstate == IB_MC_JSTATE_NON) { 2774 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2775 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2776 do_attach = B_FALSE; 2777 } else if (jstate == IB_MC_JSTATE_FULL) { 2778 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2779 do_attach = B_FALSE; 2780 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2781 do_attach = B_FALSE; 2782 } 2783 2784 if (do_attach) { 2785 /* 2786 * Do the IBA attach. 2787 */ 2788 DPRINT(10, "ibd_join_group : ibt_attach_mcg \n"); 2789 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2790 &mce->mc_info)) != IBT_SUCCESS) { 2791 DPRINT(10, "ibd_join_group : failed qp attachment " 2792 "%d\n", ibt_status); 2793 /* 2794 * NOTE that we should probably preserve the join info 2795 * in the list and later try to leave again at detach 2796 * time. 2797 */ 2798 (void) ibt_leave_mcg(state->id_sgid, mgid, 2799 state->id_sgid, jstate); 2800 kmem_free(mce, sizeof (ibd_mce_t)); 2801 return (NULL); 2802 } 2803 } 2804 2805 /* 2806 * Insert the ibd_mce_t in the proper list. 2807 */ 2808 if (jstate == IB_MC_JSTATE_NON) { 2809 IBD_MCACHE_INSERT_NON(state, mce); 2810 } else { 2811 /* 2812 * Set up the mc_req fields used for reaping the 2813 * mcg in case of delayed tx completion (see 2814 * ibd_tx_cleanup()). 
Also done for sendonly join in 2815 * case we are promoted to fullmembership later and 2816 * keep using the same mce. 2817 */ 2818 mce->mc_req.rq_gid = mgid; 2819 mce->mc_req.rq_ptr = mce; 2820 /* 2821 * Check whether this is the case of trying to join 2822 * full member, and we were already joined send only. 2823 * We try to drop our SendOnly membership, but it is 2824 * possible that the mcg does not exist anymore (and 2825 * the subnet trap never reached us), so the leave 2826 * operation might fail. 2827 */ 2828 if (omce != NULL) { 2829 (void) ibt_leave_mcg(state->id_sgid, mgid, 2830 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2831 omce->mc_jstate = IB_MC_JSTATE_FULL; 2832 bcopy(&mce->mc_info, &omce->mc_info, 2833 sizeof (ibt_mcg_info_t)); 2834 kmem_free(mce, sizeof (ibd_mce_t)); 2835 return (omce); 2836 } 2837 mutex_enter(&state->id_mc_mutex); 2838 IBD_MCACHE_INSERT_FULL(state, mce); 2839 mutex_exit(&state->id_mc_mutex); 2840 } 2841 2842 return (mce); 2843 } 2844 2845 /* 2846 * Called during port up event handling to attempt to reacquire full 2847 * membership to an mcg. Stripped down version of ibd_join_group(). 2848 * Note that it is possible that the mcg might have gone away, and 2849 * gets recreated at this point. 2850 */ 2851 static void 2852 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2853 { 2854 ib_gid_t mgid; 2855 2856 /* 2857 * If the mc_fullreap flag is set, or this join fails, a subsequent 2858 * reap/leave is going to try to leave the group. We could prevent 2859 * that by adding a boolean flag into ibd_mce_t, if required. 2860 */ 2861 if (mce->mc_fullreap) 2862 return; 2863 2864 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2865 2866 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2867 mgid.gid_guid); 2868 2869 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2870 ibd_print_warn(state, "Failure on port up to rejoin " 2871 "multicast gid %016llx:%016llx", 2872 (u_longlong_t)mgid.gid_prefix, 2873 (u_longlong_t)mgid.gid_guid); 2874 } 2875 2876 /* 2877 * This code handles delayed Tx completion cleanups for mcg's to which 2878 * disable_multicast has been issued, regular mcg related cleanups during 2879 * disable_multicast, disable_promiscuous and mcg traps, as well as 2880 * cleanups during driver detach time. Depending on the join state, 2881 * it deletes the mce from the appropriate list and issues the IBA 2882 * leave/detach; except in the disable_multicast case when the mce 2883 * is left on the active list for a subsequent Tx completion cleanup. 2884 */ 2885 static void 2886 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2887 uint8_t jstate) 2888 { 2889 ibd_mce_t *tmce; 2890 boolean_t do_detach = B_TRUE; 2891 2892 /* 2893 * Before detaching, we must check whether the other list 2894 * contains the mcg; if we detach blindly, the consumer 2895 * who set up the other list will also stop receiving 2896 * traffic. 2897 */ 2898 if (jstate == IB_MC_JSTATE_FULL) { 2899 /* 2900 * The following check is only relevant while coming 2901 * from the Tx completion path in the reap case.
2902 */ 2903 if (!mce->mc_fullreap) 2904 return; 2905 mutex_enter(&state->id_mc_mutex); 2906 IBD_MCACHE_PULLOUT_FULL(state, mce); 2907 mutex_exit(&state->id_mc_mutex); 2908 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2909 do_detach = B_FALSE; 2910 } else if (jstate == IB_MC_JSTATE_NON) { 2911 IBD_MCACHE_PULLOUT_NON(state, mce); 2912 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2913 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2914 do_detach = B_FALSE; 2915 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2916 mutex_enter(&state->id_mc_mutex); 2917 IBD_MCACHE_PULLOUT_FULL(state, mce); 2918 mutex_exit(&state->id_mc_mutex); 2919 do_detach = B_FALSE; 2920 } 2921 2922 /* 2923 * If we are reacting to a mcg trap and leaving our sendonly or 2924 * non membership, the mcg is possibly already gone, so attempting 2925 * to leave might fail. On the other hand, we must try to leave 2926 * anyway, since this might be a trap from long ago, and we could 2927 * have potentially sendonly joined to a recent incarnation of 2928 * the mcg and are about to lose track of this information. 2929 */ 2930 if (do_detach) { 2931 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 2932 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 2933 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 2934 } 2935 2936 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 2937 kmem_free(mce, sizeof (ibd_mce_t)); 2938 } 2939 2940 /* 2941 * Async code executed due to multicast and promiscuous disable requests 2942 * and mcg trap handling; also executed during driver detach. Mostly, a 2943 * leave and detach is done; except for the fullmember case when Tx 2944 * requests are pending, whence arrangements are made for subsequent 2945 * cleanup on Tx completion. 2946 */ 2947 static void 2948 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2949 { 2950 ipoib_mac_t mcmac; 2951 boolean_t recycled; 2952 ibd_mce_t *mce; 2953 2954 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 2955 jstate, mgid.gid_prefix, mgid.gid_guid); 2956 2957 if (jstate == IB_MC_JSTATE_NON) { 2958 recycled = B_TRUE; 2959 mce = IBD_MCACHE_FIND_NON(state, mgid); 2960 /* 2961 * In case we are handling a mcg trap, we might not find 2962 * the mcg in the non list. 2963 */ 2964 if (mce == NULL) 2965 return; 2966 } else { 2967 mce = IBD_MCACHE_FIND_FULL(state, mgid); 2968 2969 /* 2970 * In case we are handling a mcg trap, make sure the trap 2971 * is not arriving late; if we have an mce that indicates 2972 * that we are already a fullmember, that would be a clear 2973 * indication that the trap arrived late (ie, is for a 2974 * previous incarnation of the mcg). 2975 */ 2976 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 2977 if ((mce == NULL) || (mce->mc_jstate == 2978 IB_MC_JSTATE_FULL)) 2979 return; 2980 ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2981 } else { 2982 ASSERT(jstate == IB_MC_JSTATE_FULL); 2983 ASSERT(mce->mc_jstate == IB_MC_JSTATE_FULL); 2984 2985 /* 2986 * If join group failed, mce will be NULL here. 2987 * This is because in the GLDv3 driver, set multicast 2988 * will always return success. 2989 */ 2990 if (mce == NULL) 2991 return; 2992 mce->mc_fullreap = B_TRUE; 2993 } 2994 2995 /* 2996 * If no pending Tx's remain that reference the AH 2997 * for the mcg, recycle it from active to free list.
2998 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 2999 * so the last completing Tx will cause an async reap 3000 * operation to be invoked, at which time we will drop our 3001 * membership to the mcg so that the pending Tx's complete 3002 * successfully. Refer to comments on "AH and MCE active 3003 * list manipulation" at top of this file. The lock protects 3004 * against Tx fast path and Tx cleanup code. 3005 */ 3006 mutex_enter(&state->id_ac_mutex); 3007 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3008 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3009 IB_MC_JSTATE_SEND_ONLY_NON)); 3010 mutex_exit(&state->id_ac_mutex); 3011 } 3012 3013 if (recycled) { 3014 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3015 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3016 ibd_async_reap_group(state, mce, mgid, jstate); 3017 } 3018 } 3019 3020 /* 3021 * Find the broadcast address as defined by IPoIB; implicitly 3022 * determines the IBA scope, mtu, tclass etc of the link the 3023 * interface is going to be a member of. 3024 */ 3025 static ibt_status_t 3026 ibd_find_bgroup(ibd_state_t *state) 3027 { 3028 ibt_mcg_attr_t mcg_attr; 3029 uint_t numg; 3030 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3031 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3032 IB_MC_SCOPE_GLOBAL }; 3033 int i, mcgmtu; 3034 boolean_t found = B_FALSE; 3035 3036 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3037 mcg_attr.mc_pkey = state->id_pkey; 3038 state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK; 3039 3040 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3041 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3042 3043 /* 3044 * Look for the IPoIB broadcast group. 3045 */ 3046 state->id_mgid.gid_prefix = 3047 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3048 ((uint64_t)state->id_scope << 48) | 3049 ((uint32_t)(state->id_pkey << 16))); 3050 mcg_attr.mc_mgid = state->id_mgid; 3051 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3052 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3053 found = B_TRUE; 3054 break; 3055 } 3056 3057 } 3058 3059 if (!found) { 3060 ibd_print_warn(state, "IPoIB broadcast group absent"); 3061 return (IBT_FAILURE); 3062 } 3063 3064 /* 3065 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3066 */ 3067 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3068 if (state->id_mtu < mcgmtu) { 3069 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3070 "greater than port's maximum MTU %d", mcgmtu, 3071 state->id_mtu); 3072 return (IBT_FAILURE); 3073 } 3074 state->id_mtu = mcgmtu; 3075 3076 return (IBT_SUCCESS); 3077 } 3078 3079 /* 3080 * Post ibt_attach() initialization. 3081 */ 3082 static int 3083 ibd_drv_init(ibd_state_t *state) 3084 { 3085 kthread_t *kht; 3086 ibt_ud_chan_alloc_args_t ud_alloc_attr; 3087 ibt_ud_chan_query_attr_t ud_chan_attr; 3088 ibt_hca_portinfo_t *port_infop; 3089 ibt_hca_attr_t hca_attrs; 3090 ibt_status_t ibt_status; 3091 ibt_cq_attr_t cq_attr; 3092 ib_guid_t hca_guid; 3093 uint32_t real_size; 3094 uint32_t *ptr; 3095 char pathname[OBP_MAXPATHLEN]; 3096 uint_t psize, port_infosz; 3097 3098 /* 3099 * Initialize id_port before ibt_open_hca because of 3100 * ordering requirements in port up/down handling. 
3101 */ 3102 if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) 3103 return (DDI_FAILURE); 3104 3105 if (ibt_open_hca(state->id_ibt_hdl, hca_guid, 3106 &state->id_hca_hdl) != IBT_SUCCESS) { 3107 DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); 3108 return (DDI_FAILURE); 3109 } 3110 3111 mutex_enter(&state->id_link_mutex); 3112 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 3113 state->id_port, &port_infop, &psize, 3114 &port_infosz); 3115 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 3116 mutex_exit(&state->id_link_mutex); 3117 DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); 3118 (void) ibt_close_hca(state->id_hca_hdl); 3119 return (DDI_FAILURE); 3120 } 3121 3122 /* 3123 * If the link already went down by the time we get here, give up; 3124 * we can not even get the gid since that is not valid. We would 3125 * fail in ibd_find_bgroup() anyway. 3126 */ 3127 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 3128 mutex_exit(&state->id_link_mutex); 3129 ibt_free_portinfo(port_infop, port_infosz); 3130 (void) ibt_close_hca(state->id_hca_hdl); 3131 ibd_print_warn(state, "Port is not active"); 3132 return (DDI_FAILURE); 3133 } 3134 3135 /* 3136 * This verifies the Pkey ibnexus handed us is still valid. 3137 * This is also the point from which the pkey table for the 3138 * port must hold the exact pkey value at the exact index 3139 * across port up/downs. 3140 */ 3141 if (ibt_pkey2index(state->id_hca_hdl, state->id_port, 3142 state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { 3143 mutex_exit(&state->id_link_mutex); 3144 ibt_free_portinfo(port_infop, port_infosz); 3145 DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); 3146 (void) ibt_close_hca(state->id_hca_hdl); 3147 return (DDI_FAILURE); 3148 } 3149 3150 state->id_mtu = (128 << port_infop->p_mtu); 3151 state->id_sgid = *port_infop->p_sgid_tbl; 3152 state->id_link_state = LINK_STATE_UP; 3153 mutex_exit(&state->id_link_mutex); 3154 3155 ibt_free_portinfo(port_infop, port_infosz); 3156 3157 state->id_link_speed = ibd_get_portspeed(state); 3158 3159 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 3160 ASSERT(ibt_status == IBT_SUCCESS); 3161 3162 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 3163 DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); 3164 goto drv_init_fail_find_bgroup; 3165 } 3166 3167 if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 3168 &state->id_pd_hdl) != IBT_SUCCESS) { 3169 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); 3170 goto drv_init_fail_alloc_pd; 3171 } 3172 3173 /* Initialize the parallel ARP cache and AHs */ 3174 if (ibd_acache_init(state) != DDI_SUCCESS) { 3175 DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); 3176 goto drv_init_fail_acache; 3177 } 3178 3179 /* 3180 * Check various tunable limits. 3181 */ 3182 if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) { 3183 ibd_print_warn(state, "Setting #sgl = %d instead of default %d", 3184 hca_attrs.hca_max_sgl, IBD_MAX_SQSEG); 3185 state->id_max_sqseg = hca_attrs.hca_max_sgl; 3186 } else { 3187 state->id_max_sqseg = IBD_MAX_SQSEG; 3188 } 3189 3190 /* 3191 * First, check #r/s wqes against max channel size. 
3192 */ 3193 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) 3194 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 3195 else 3196 state->id_num_rwqe = IBD_NUM_RWQE; 3197 3198 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) 3199 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 3200 else 3201 state->id_num_swqe = IBD_NUM_SWQE; 3202 3203 /* 3204 * Allocate Rx/combined CQ: 3205 * Theoretically, there is no point in having more than #rwqe 3206 * plus #swqe cqe's, except that the CQ will be signalled for 3207 * overflow when the last wqe completes, if none of the previous 3208 * cqe's have been polled. Thus, we allocate just a few less wqe's 3209 * to make sure such overflow does not occur. 3210 */ 3211 cq_attr.cq_sched = NULL; 3212 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 3213 3214 if (ibd_separate_cqs == 1) { 3215 /* 3216 * Allocate Receive CQ. 3217 */ 3218 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 3219 cq_attr.cq_size = state->id_num_rwqe + 1; 3220 } else { 3221 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3222 state->id_num_rwqe = cq_attr.cq_size - 1; 3223 } 3224 3225 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3226 ibd_print_warn(state, "Computed #rwqe %d based on " 3227 "requested size and supportable CQ size is less " 3228 "than the required threshold %d", 3229 state->id_num_rwqe, IBD_RX_THRESHOLD); 3230 goto drv_init_fail_min_rwqes; 3231 } 3232 3233 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3234 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3235 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3236 goto drv_init_fail_alloc_rcq; 3237 } 3238 state->id_rxwcs_size = state->id_num_rwqe + 1; 3239 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 3240 state->id_rxwcs_size, KM_SLEEP); 3241 3242 3243 /* 3244 * Allocate Send CQ. 3245 */ 3246 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 3247 cq_attr.cq_size = state->id_num_swqe + 1; 3248 } else { 3249 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3250 state->id_num_swqe = cq_attr.cq_size - 1; 3251 } 3252 3253 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3254 &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { 3255 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3256 goto drv_init_fail_alloc_scq; 3257 } 3258 state->id_txwcs_size = state->id_num_swqe + 1; 3259 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 3260 state->id_txwcs_size, KM_SLEEP); 3261 } else { 3262 /* 3263 * Allocate combined Send/Receive CQ. 
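 * If the HCA cannot provide #rwqe + #swqe + 1 CQ entries, the two counts are scaled down proportionally to fit; e.g. if hca_max_cq_sz were 512 and the rwqe/swqe requests were equal (and too large to fit), the split would work out to 255 rwqe and 256 swqe.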
3264 */ 3265 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 3266 state->id_num_swqe + 1)) { 3267 cq_attr.cq_size = state->id_num_rwqe + 3268 state->id_num_swqe + 1; 3269 } else { 3270 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3271 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 3272 state->id_num_rwqe) / (state->id_num_rwqe + 3273 state->id_num_swqe); 3274 state->id_num_swqe = cq_attr.cq_size - 1 - 3275 state->id_num_rwqe; 3276 } 3277 3278 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3279 ibd_print_warn(state, "Computed #rwqe %d based on " 3280 "requested size and supportable CQ size is less " 3281 "than the required threshold %d", 3282 state->id_num_rwqe, IBD_RX_THRESHOLD); 3283 goto drv_init_fail_min_rwqes; 3284 } 3285 3286 state->id_rxwcs_size = cq_attr.cq_size; 3287 state->id_txwcs_size = state->id_rxwcs_size; 3288 3289 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3290 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3291 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3292 goto drv_init_fail_alloc_rcq; 3293 } 3294 state->id_scq_hdl = state->id_rcq_hdl; 3295 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 3296 state->id_rxwcs_size, KM_SLEEP); 3297 state->id_txwcs = state->id_rxwcs; 3298 } 3299 3300 /* 3301 * Print message in case we could not allocate as many wqe's 3302 * as was requested. Note that in the combined CQ case, we will 3303 * get the following message. 3304 */ 3305 if (state->id_num_rwqe != IBD_NUM_RWQE) 3306 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 3307 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 3308 if (state->id_num_swqe != IBD_NUM_SWQE) 3309 ibd_print_warn(state, "Setting #swqe = %d instead of default " 3310 "%d", state->id_num_swqe, IBD_NUM_SWQE); 3311 3312 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 3313 ud_alloc_attr.ud_hca_port_num = state->id_port; 3314 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 3315 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 3316 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 3317 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 3318 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 3319 ud_alloc_attr.ud_scq = state->id_scq_hdl; 3320 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 3321 ud_alloc_attr.ud_pd = state->id_pd_hdl; 3322 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 3323 ud_alloc_attr.ud_clone_chan = NULL; 3324 if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 3325 &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { 3326 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" 3327 "\n"); 3328 goto drv_init_fail_alloc_chan; 3329 } 3330 3331 if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != 3332 DDI_SUCCESS) { 3333 DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); 3334 goto drv_init_fail_query_chan; 3335 } 3336 state->id_qpnum = ud_chan_attr.ud_qpn; 3337 3338 /* Initialize the Transmit buffer list */ 3339 if (ibd_init_txlist(state) != DDI_SUCCESS) { 3340 DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); 3341 goto drv_init_fail_txlist_init; 3342 } 3343 3344 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 3345 /* Setup the handler we will use for regular DLPI stuff */ 3346 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 3347 if (ibt_enable_cq_notify(state->id_scq_hdl, 3348 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 3349 DPRINT(10, "ibd_drv_init : failed in" 3350 " ibt_enable_cq_notify()\n"); 3351 goto drv_init_fail_cq_notify; 3352 } 3353 } 3354 3355 /* Create the service fifos before we start receiving */ 
3356 if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos, 3357 state)) == NULL) { 3358 DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n"); 3359 goto drv_init_fail_srv_fifo; 3360 } 3361 3362 /* Initialize the Receive buffer list */ 3363 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 3364 DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); 3365 goto drv_init_fail_rxlist_init; 3366 } 3367 3368 /* Join to IPoIB broadcast group as required by IPoIB */ 3369 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 3370 DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); 3371 goto drv_init_fail_join_group; 3372 } 3373 3374 /* Create the async thread */ 3375 if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 3376 TS_RUN, minclsyspri)) == NULL) { 3377 /* Do we have to specially leave the group? */ 3378 DPRINT(10, "ibd_drv_init : failed in thread_create\n"); 3379 goto drv_init_fail_thread_create; 3380 } 3381 state->id_async_thrid = kht->t_did; 3382 3383 /* 3384 * The local mac address is now known. Create the IPoIB 3385 * address. 3386 */ 3387 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 3388 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3389 /* 3390 * Similarly, program in the broadcast mac address. 3391 */ 3392 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, 3393 state->id_mgid.gid_guid); 3394 3395 ptr = (uint32_t *)&state->id_macaddr; 3396 DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", 3397 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3398 ptr = (uint32_t *)&state->id_bcaddr; 3399 DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", 3400 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3401 DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", 3402 state->id_pkey, state->id_mgid.gid_prefix, 3403 state->id_mgid.gid_guid); 3404 DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", 3405 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3406 DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); 3407 DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); 3408 (void) ddi_pathname(state->id_dip, pathname); 3409 DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); 3410 3411 return (DDI_SUCCESS); 3412 3413 drv_init_fail_thread_create: 3414 ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL); 3415 3416 drv_init_fail_join_group: 3417 ibd_fini_rxlist(state); 3418 3419 drv_init_fail_rxlist_init: 3420 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3421 3422 drv_init_fail_srv_fifo: 3423 drv_init_fail_cq_notify: 3424 ibd_fini_txlist(state); 3425 3426 drv_init_fail_txlist_init: 3427 drv_init_fail_query_chan: 3428 if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) 3429 DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); 3430 3431 drv_init_fail_alloc_chan: 3432 if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != 3433 IBT_SUCCESS)) 3434 DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); 3435 3436 if (ibd_separate_cqs == 1) 3437 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * 3438 state->id_txwcs_size); 3439 3440 drv_init_fail_alloc_scq: 3441 if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) 3442 DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); 3443 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); 3444 3445 drv_init_fail_min_rwqes: 3446 drv_init_fail_alloc_rcq: 3447 ibd_acache_fini(state); 3448 drv_init_fail_acache: 3449 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3450 
DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); 3451 3452 drv_init_fail_alloc_pd: 3453 ibt_free_mcg_info(state->id_mcinfo, 1); 3454 drv_init_fail_find_bgroup: 3455 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3456 DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); 3457 3458 return (DDI_FAILURE); 3459 } 3460 3461 /* 3462 * Allocate the statically allocated Tx buffer list. 3463 */ 3464 static int 3465 ibd_init_txlist(ibd_state_t *state) 3466 { 3467 ibd_swqe_t *swqe; 3468 int i; 3469 3470 for (i = 0; i < state->id_num_swqe; i++) { 3471 if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) { 3472 DPRINT(10, "ibd_init_txlist : failed in " 3473 "ibd_alloc_swqe()\n"); 3474 ibd_fini_txlist(state); 3475 return (DDI_FAILURE); 3476 } 3477 3478 /* add to list */ 3479 state->id_tx_list.dl_cnt++; 3480 if (state->id_tx_list.dl_head == NULL) { 3481 swqe->swqe_prev = NULL; 3482 swqe->swqe_next = NULL; 3483 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3484 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3485 } else { 3486 swqe->swqe_prev = state->id_tx_list.dl_tail; 3487 swqe->swqe_next = NULL; 3488 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3489 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3490 } 3491 } 3492 3493 return (DDI_SUCCESS); 3494 } 3495 3496 /* 3497 * Free the statically allocated Tx buffer list. 3498 */ 3499 static void 3500 ibd_fini_txlist(ibd_state_t *state) 3501 { 3502 ibd_swqe_t *node; 3503 3504 mutex_enter(&state->id_tx_list.dl_mutex); 3505 while (state->id_tx_list.dl_head != NULL) { 3506 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3507 state->id_tx_list.dl_head = node->swqe_next; 3508 state->id_tx_list.dl_cnt--; 3509 ASSERT(state->id_tx_list.dl_cnt >= 0); 3510 ibd_free_swqe(state, node); 3511 } 3512 mutex_exit(&state->id_tx_list.dl_mutex); 3513 } 3514 3515 /* 3516 * Allocate a single send wqe and register it so it is almost 3517 * ready to be posted to the hardware. 
3518 */ 3519 static int 3520 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe) 3521 { 3522 ibt_mr_attr_t mem_attr; 3523 ibd_swqe_t *swqe; 3524 3525 swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP); 3526 *wqe = swqe; 3527 swqe->swqe_type = IBD_WQE_SEND; 3528 swqe->swqe_next = NULL; 3529 swqe->swqe_prev = NULL; 3530 swqe->swqe_im_mblk = NULL; 3531 3532 /* alloc copy buffer, must be max size to handle multiple mblk case */ 3533 swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP); 3534 3535 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3536 mem_attr.mr_len = state->id_mtu; 3537 mem_attr.mr_as = NULL; 3538 mem_attr.mr_flags = IBT_MR_SLEEP; 3539 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3540 &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) != 3541 IBT_SUCCESS) { 3542 DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()"); 3543 kmem_free(swqe->swqe_copybuf.ic_bufaddr, 3544 state->id_mtu); 3545 kmem_free(swqe, sizeof (ibd_swqe_t)); 3546 return (DDI_FAILURE); 3547 } 3548 3549 swqe->swqe_copybuf.ic_sgl.ds_va = 3550 (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3551 swqe->swqe_copybuf.ic_sgl.ds_key = 3552 swqe->swqe_copybuf.ic_mr_desc.md_lkey; 3553 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3554 3555 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3556 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3557 swqe->w_swr.wr_trans = IBT_UD_SRV; 3558 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3559 3560 /* These are set in send */ 3561 swqe->w_swr.wr_nds = 0; 3562 swqe->w_swr.wr_sgl = NULL; 3563 3564 return (DDI_SUCCESS); 3565 } 3566 3567 /* 3568 * Free an allocated send wqe. 3569 */ 3570 static void 3571 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3572 { 3573 3574 if (ibt_deregister_mr(state->id_hca_hdl, 3575 swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3576 DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()"); 3577 return; 3578 } 3579 kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu); 3580 kmem_free(swqe, sizeof (ibd_swqe_t)); 3581 } 3582 3583 /* 3584 * Post a rwqe to the hardware and add it to the Rx list. The 3585 * "recycle" parameter indicates whether an old rwqe is being 3586 * recycled, or this is a new one. 3587 */ 3588 static int 3589 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3590 { 3591 /* 3592 * Here we should add dl_cnt before post recv, because we would 3593 * have to make sure dl_cnt has already updated before 3594 * corresponding ibd_process_rx() is called. 3595 */ 3596 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3597 if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) != 3598 IBT_SUCCESS) { 3599 (void) atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 3600 DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()"); 3601 return (DDI_FAILURE); 3602 } 3603 3604 /* 3605 * Buffers being recycled are already in the list. 
3606 */ 3607 if (recycle) 3608 return (DDI_SUCCESS); 3609 3610 mutex_enter(&state->id_rx_list.dl_mutex); 3611 if (state->id_rx_list.dl_head == NULL) { 3612 rwqe->rwqe_prev = NULL; 3613 rwqe->rwqe_next = NULL; 3614 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3615 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3616 } else { 3617 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3618 rwqe->rwqe_next = NULL; 3619 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3620 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3621 } 3622 mutex_exit(&state->id_rx_list.dl_mutex); 3623 3624 return (DDI_SUCCESS); 3625 } 3626 3627 /* 3628 * Allocate the statically allocated Rx buffer list. 3629 */ 3630 static int 3631 ibd_init_rxlist(ibd_state_t *state) 3632 { 3633 ibd_rwqe_t *rwqe; 3634 int i; 3635 3636 for (i = 0; i < state->id_num_rwqe; i++) { 3637 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3638 ibd_fini_rxlist(state); 3639 return (DDI_FAILURE); 3640 } 3641 3642 if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { 3643 ibd_free_rwqe(state, rwqe); 3644 ibd_fini_rxlist(state); 3645 return (DDI_FAILURE); 3646 } 3647 } 3648 3649 return (DDI_SUCCESS); 3650 } 3651 3652 /* 3653 * Free the statically allocated Rx buffer list. 3654 * 3655 */ 3656 static void 3657 ibd_fini_rxlist(ibd_state_t *state) 3658 { 3659 ibd_rwqe_t *node; 3660 3661 mutex_enter(&state->id_rx_list.dl_mutex); 3662 while (state->id_rx_list.dl_head != NULL) { 3663 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3664 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3665 state->id_rx_list.dl_cnt--; 3666 ASSERT(state->id_rx_list.dl_cnt >= 0); 3667 3668 ibd_free_rwqe(state, node); 3669 } 3670 mutex_exit(&state->id_rx_list.dl_mutex); 3671 } 3672 3673 /* 3674 * Allocate a single recv wqe and register it so it is almost 3675 * ready to be posted to the hardware. 
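 */

/*
 * ibd_alloc_rwqe() below wraps each pre-registered receive buffer in
 * an mblk via desballoc(), so that when the upper layer finally frees
 * the message, STREAMS calls the driver back (ibd_freemsg_cb(),
 * further down) and the buffer can be recycled rather than
 * reallocated. A minimal sketch of that pattern follows;
 * sketch_buf_done() and sketch_wrap_buf() are hypothetical names.
 */
#if 0	/* illustrative sketch; not compiled into the driver */
static void
sketch_buf_done(char *arg)
{
	/* the loaned-up buffer is free again; requeue or release it */
}

static mblk_t *
sketch_wrap_buf(uchar_t *buf, size_t len, frtn_t *frtn)
{
	frtn->free_func = sketch_buf_done;
	frtn->free_arg = (char *)buf;
	/* third argument is the priority; 0, as the driver itself passes */
	return (desballoc(buf, len, 0, frtn));
}
#endif

/*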
3676 */ 3677 static int 3678 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3679 { 3680 ibt_mr_attr_t mem_attr; 3681 ibd_rwqe_t *rwqe; 3682 3683 if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3684 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3685 return (DDI_FAILURE); 3686 } 3687 *wqe = rwqe; 3688 rwqe->rwqe_type = IBD_WQE_RECV; 3689 rwqe->w_state = state; 3690 rwqe->rwqe_next = NULL; 3691 rwqe->rwqe_prev = NULL; 3692 rwqe->w_freeing_wqe = B_FALSE; 3693 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3694 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3695 3696 if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3697 IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) { 3698 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2"); 3699 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3700 return (DDI_FAILURE); 3701 } 3702 3703 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3704 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3705 NULL) { 3706 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3707 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3708 state->id_mtu + IPOIB_GRH_SIZE); 3709 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3710 return (DDI_FAILURE); 3711 } 3712 3713 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3714 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3715 mem_attr.mr_as = NULL; 3716 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3717 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3718 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3719 IBT_SUCCESS) { 3720 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3721 rwqe->w_freeing_wqe = B_TRUE; 3722 freemsg(rwqe->rwqe_im_mblk); 3723 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3724 state->id_mtu + IPOIB_GRH_SIZE); 3725 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3726 return (DDI_FAILURE); 3727 } 3728 3729 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3730 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3731 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3732 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3733 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3734 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3735 rwqe->w_rwr.wr_nds = 1; 3736 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3737 3738 return (DDI_SUCCESS); 3739 } 3740 3741 /* 3742 * Free an allocated recv wqe. 3743 */ 3744 static void 3745 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3746 { 3747 3748 if (ibt_deregister_mr(state->id_hca_hdl, 3749 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3750 DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()"); 3751 return; 3752 } 3753 3754 /* 3755 * Indicate to the callback function that this rwqe/mblk 3756 * should not be recycled. The freemsg() will invoke 3757 * ibd_freemsg_cb(). 3758 */ 3759 if (rwqe->rwqe_im_mblk != NULL) { 3760 rwqe->w_freeing_wqe = B_TRUE; 3761 freemsg(rwqe->rwqe_im_mblk); 3762 } 3763 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3764 state->id_mtu + IPOIB_GRH_SIZE); 3765 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3766 } 3767 3768 /* 3769 * Delete the rwqe being freed from the rx list. 
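 */

/*
 * ibd_free_rwqe() above sets w_freeing_wqe before calling freemsg(),
 * so that the STREAMS free callback (ibd_freemsg_cb(), further down)
 * sees the flag and returns without trying to recycle a buffer that
 * is in the middle of being torn down. A stripped-down version of
 * that handshake, using hypothetical names:
 */
#if 0	/* illustrative sketch; not compiled into the driver */
typedef struct sketch_rbuf {
	boolean_t	rb_destroying;	/* analogous to w_freeing_wqe */
	/* backing buffer, mblk, registration handle, ... */
} sketch_rbuf_t;

/* owner's teardown path */
static void
sketch_rbuf_destroy(sketch_rbuf_t *rb, mblk_t *mp)
{
	rb->rb_destroying = B_TRUE;	/* set before releasing the mblk */
	freemsg(mp);			/* may invoke the callback inline */
	/* now safe to deregister and free the backing buffer */
}

/* STREAMS free callback */
static void
sketch_rbuf_freecb(char *arg)
{
	sketch_rbuf_t *rb = (sketch_rbuf_t *)arg;

	if (rb->rb_destroying)
		return;		/* owner is freeing; do not recycle */
	/* otherwise re-wrap the buffer and post it back to the HCA */
}
#endif

/*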
3770 */ 3771 static void 3772 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3773 { 3774 mutex_enter(&state->id_rx_list.dl_mutex); 3775 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3776 state->id_rx_list.dl_head = rwqe->rwqe_next; 3777 else 3778 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3779 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3780 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3781 else 3782 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3783 mutex_exit(&state->id_rx_list.dl_mutex); 3784 } 3785 3786 /* 3787 * Pre ibt_detach() deconstruction. 3788 */ 3789 static void 3790 ibd_drv_fini(ibd_state_t *state) 3791 { 3792 ib_gid_t mgid; 3793 ibd_mce_t *mce; 3794 ibt_status_t status; 3795 uint8_t jstate; 3796 3797 /* 3798 * Desubscribe from trap notices; we will be tearing down 3799 * the mcg lists soon. Make sure the trap handler does nothing 3800 * even if it is invoked (ie till we invoke ibt_detach()). 3801 */ 3802 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 3803 mutex_enter(&state->id_trap_lock); 3804 state->id_trap_stop = B_TRUE; 3805 while (state->id_trap_inprog > 0) 3806 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 3807 mutex_exit(&state->id_trap_lock); 3808 3809 /* 3810 * Flushing the channel ensures that all pending WQE's 3811 * are marked with flush_error and handed to the CQ. It 3812 * does not guarantee the invocation of the CQ handler. 3813 * This call is guaranteed to return successfully for UD QPNs. 3814 */ 3815 status = ibt_flush_channel(state->id_chnl_hdl); 3816 ASSERT(status == IBT_SUCCESS); 3817 3818 /* 3819 * We possibly need a loop here to wait for all the Tx 3820 * callbacks to happen. The Tx handlers will retrieve 3821 * held resources like AH ac_ref count, registered memory 3822 * and possibly ASYNC_REAP requests. Rx interrupts were already 3823 * turned off (in ibd_detach()); turn off Tx interrupts and 3824 * poll. By the time the polling returns an empty indicator, 3825 * we are sure we have seen all pending Tx callbacks. Note 3826 * that after the ibt_set_cq_handler() returns, the old handler 3827 * is guaranteed not to be invoked anymore. 3828 */ 3829 if (ibd_separate_cqs == 1) 3830 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 3831 ibd_poll_compq(state, state->id_scq_hdl); 3832 3833 /* 3834 * No more async requests will be posted since the device has been 3835 * unregistered; completion handlers have been turned off, so Tx 3836 * handler will not cause any more ASYNC_REAP requests. Queue a 3837 * request for the async thread to exit, which will be serviced 3838 * after any pending ones. This can take a while, specially if the 3839 * SM is unreachable, since IBMF will slowly timeout each SM request 3840 * issued by the async thread. Reap the thread before continuing on, 3841 * we do not want it to be lingering in modunloaded code. 3842 */ 3843 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT); 3844 thread_join(state->id_async_thrid); 3845 3846 /* 3847 * We can not be in promiscuous mode anymore, upper layers 3848 * would have made a request to disable it (if ever set previously) 3849 * before the detach is allowed to progress to this point; and the 3850 * aysnc thread would have processed that request by now. Thus the 3851 * nonmember list is guaranteed empty at this point. 3852 */ 3853 ASSERT(state->id_prom_op != COMPLETED); 3854 3855 /* 3856 * Drop all residual full/non membership. This includes full 3857 * membership to the broadcast group, and any nonmembership 3858 * acquired during transmits. 
We do this after the Tx completion 3859 * handlers are done, since those might result in some late 3860 * leaves; this also eliminates a potential race with that 3861 * path wrt the mc full list insert/delete. Trap handling 3862 * has also been suppressed at this point. Thus, no locks 3863 * are required while traversing the mc full list. 3864 */ 3865 DPRINT(2, "ibd_drv_fini : clear full cache entries"); 3866 mce = list_head(&state->id_mc_full); 3867 while (mce != NULL) { 3868 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3869 jstate = mce->mc_jstate; 3870 mce = list_next(&state->id_mc_full, mce); 3871 ibd_leave_group(state, mgid, jstate); 3872 } 3873 3874 ibt_free_mcg_info(state->id_mcinfo, 1); 3875 3876 /* 3877 * Kill the channel now; guaranteed to return successfully 3878 * for UD QPNs. 3879 */ 3880 status = ibt_free_channel(state->id_chnl_hdl); 3881 ASSERT(status == IBT_SUCCESS); 3882 3883 /* 3884 * Kill the CQ; all completion handlers are guaranteed to 3885 * have terminated by the time this returns. Since we killed 3886 * the QPN above, we can not receive the IBT_CQ_BUSY error. 3887 */ 3888 status = ibt_free_cq(state->id_rcq_hdl); 3889 ASSERT(status == IBT_SUCCESS); 3890 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); 3891 3892 if (ibd_separate_cqs == 1) { 3893 status = ibt_free_cq(state->id_scq_hdl); 3894 ASSERT(status == IBT_SUCCESS); 3895 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * 3896 state->id_txwcs_size); 3897 } 3898 3899 /* 3900 * We killed the receive interrupts, thus, we will not be 3901 * required to handle received packets anymore. Thus, kill 3902 * service threads since they are not going to be used anymore. 3903 */ 3904 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3905 3906 /* 3907 * Since these following will act on the Rx/Tx list, which 3908 * is also looked at by the Rx/Tx handlers, keep them around 3909 * till all handlers are guaranteed to have completed. 3910 */ 3911 ibd_fini_rxlist(state); 3912 ibd_fini_txlist(state); 3913 3914 /* 3915 * Clean up the active AH hash list. 3916 */ 3917 mod_hash_destroy_hash(state->id_ah_active_hash); 3918 3919 /* 3920 * Free parallel ARP cache and AHs; we are sure all of these 3921 * resources have been released by the Tx completion handler. 3922 */ 3923 ibd_acache_fini(state); 3924 3925 /* 3926 * We freed the QPN, all the MRs and AHs. This step should not 3927 * fail; print a warning message if it does fail, due to a bug 3928 * in the driver. 3929 */ 3930 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3931 ibd_print_warn(state, "failed to free protection domain"); 3932 3933 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3934 ibd_print_warn(state, "failed to close HCA device"); 3935 } 3936 3937 /* 3938 * IBA Rx/Tx completion queue handler. Guaranteed to be single 3939 * threaded and nonreentrant for this CQ. When using combined CQ, 3940 * this handles Tx and Rx completions. With separate CQs, this handles 3941 * only Rx completions. 3942 */ 3943 /* ARGSUSED */ 3944 static void 3945 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3946 { 3947 ibd_state_t *state = (ibd_state_t *)arg; 3948 3949 atomic_add_64(&state->id_num_intrs, 1); 3950 3951 if (ibd_rx_softintr == 1) 3952 ddi_trigger_softintr(state->id_rx); 3953 else 3954 (void) ibd_intr((char *)state); 3955 } 3956 3957 /* 3958 * Separate CQ handler for Tx completions, when the Tx CQ is in 3959 * interrupt driven mode. 
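 */

/*
 * Both CQ handlers (ibd_rcq_handler() above and ibd_scq_handler()
 * below) do very little at interrupt level: they count the interrupt
 * and, when the softintr tunables are set, hand the real completion
 * polling off to a soft interrupt (ibd_intr()/ibd_tx_recycle()). A
 * minimal sketch of that split is shown here; sketch_soft_id and
 * sketch_pending are hypothetical, and the soft interrupt is assumed
 * to have been registered elsewhere (e.g. with ddi_add_softintr() at
 * attach time).
 */
#if 0	/* illustrative sketch; not compiled into the driver */
static ddi_softintr_t	sketch_soft_id;		/* set up at attach */
static uint32_t		sketch_pending;

/* runs in interrupt context: keep it short */
static void
sketch_hard_handler(void)
{
	atomic_add_32(&sketch_pending, 1);
	ddi_trigger_softintr(sketch_soft_id);
}

/* runs later at soft interrupt priority: drain the CQ here */
static uint_t
sketch_soft_handler(caddr_t arg)
{
	/* poll completions, hand packets up, re-arm notifications */
	return (DDI_INTR_CLAIMED);
}
#endif

/*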
3960 */ 3961 /* ARGSUSED */ 3962 static void 3963 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3964 { 3965 ibd_state_t *state = (ibd_state_t *)arg; 3966 3967 atomic_add_64(&state->id_num_intrs, 1); 3968 3969 if (ibd_tx_softintr == 1) 3970 ddi_trigger_softintr(state->id_tx); 3971 else 3972 (void) ibd_tx_recycle((char *)state); 3973 } 3974 3975 /* 3976 * Multicast group create/delete trap handler. These will be delivered 3977 * on a kernel thread (handling can thus block) and can be invoked 3978 * concurrently. The handler can be invoked anytime after it is 3979 * registered and before ibt_detach(). 3980 */ 3981 /* ARGSUSED */ 3982 static void 3983 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 3984 ibt_subnet_event_t *event) 3985 { 3986 ibd_state_t *state = (ibd_state_t *)arg; 3987 ibd_req_t *req; 3988 3989 /* 3990 * The trap handler will get invoked once for every event for 3991 * evert port. The input "gid" is the GID0 of the port the 3992 * trap came in on; we just need to act on traps that came 3993 * to our port, meaning the port on which the ipoib interface 3994 * resides. Since ipoib uses GID0 of the port, we just match 3995 * the gids to check whether we need to handle the trap. 3996 */ 3997 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 3998 return; 3999 4000 DPRINT(10, "ibd_notices_handler : %d\n", code); 4001 4002 switch (code) { 4003 case IBT_SM_EVENT_UNAVAILABLE: 4004 /* 4005 * If we are in promiscuous mode or have 4006 * sendnonmembers, we need to print a warning 4007 * message right now. Else, just store the 4008 * information, print when we enter promiscuous 4009 * mode or attempt nonmember send. We might 4010 * also want to stop caching sendnonmember. 4011 */ 4012 ibd_print_warn(state, "IBA multicast support " 4013 "degraded due to unavailability of multicast " 4014 "traps"); 4015 break; 4016 case IBT_SM_EVENT_AVAILABLE: 4017 /* 4018 * If we printed a warning message above or 4019 * while trying to nonmember send or get into 4020 * promiscuous mode, print an okay message. 4021 */ 4022 ibd_print_warn(state, "IBA multicast support " 4023 "restored due to availability of multicast " 4024 "traps"); 4025 break; 4026 case IBT_SM_EVENT_MCG_CREATED: 4027 case IBT_SM_EVENT_MCG_DELETED: 4028 /* 4029 * Common processing of creation/deletion traps. 4030 * First check if the instance is being 4031 * [de]initialized; back off then, without doing 4032 * anything more, since we are not sure if the 4033 * async thread is around, or whether we might 4034 * be racing with the detach code in ibd_drv_fini() 4035 * that scans the mcg list. 4036 */ 4037 if (!ibd_async_safe(state)) 4038 return; 4039 4040 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4041 req->rq_gid = event->sm_notice_gid; 4042 req->rq_ptr = (void *)code; 4043 ibd_queue_work_slot(state, req, ASYNC_TRAP); 4044 break; 4045 } 4046 } 4047 4048 static void 4049 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4050 { 4051 ib_gid_t mgid = req->rq_gid; 4052 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4053 4054 DPRINT(10, "ibd_async_trap : %d\n", code); 4055 4056 /* 4057 * Atomically search the nonmember and sendonlymember lists and 4058 * delete. 4059 */ 4060 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4061 4062 if (state->id_prom_op == COMPLETED) { 4063 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4064 4065 /* 4066 * If in promiscuous mode, try to join/attach to the new 4067 * mcg. 
Given the unreliable out-of-order mode of trap 4068 * delivery, we can never be sure whether it is a problem 4069 * if the join fails. Thus, we warn the admin of a failure 4070 * if this was a creation trap. Note that the trap might 4071 * actually be reporting a long past event, and the mcg 4072 * might already have been deleted, thus we might be warning 4073 * in vain. 4074 */ 4075 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4076 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4077 ibd_print_warn(state, "IBA promiscuous mode missed " 4078 "new multicast gid %016llx:%016llx", 4079 (u_longlong_t)mgid.gid_prefix, 4080 (u_longlong_t)mgid.gid_guid); 4081 } 4082 4083 /* 4084 * Free the request slot allocated by the subnet event thread. 4085 */ 4086 ibd_async_done(state); 4087 } 4088 4089 /* 4090 * GLDv3 entry point to get capabilities. 4091 */ 4092 static boolean_t 4093 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4094 { 4095 _NOTE(ARGUNUSED(arg)); 4096 4097 switch (cap) { 4098 case MAC_CAPAB_HCKSUM: { 4099 uint32_t *txflags = cap_data; 4100 4101 if (ibd_csum_send > IBD_CSUM_NONE) 4102 *txflags = HCKSUM_INET_PARTIAL; 4103 else 4104 return (B_FALSE); 4105 break; 4106 } 4107 case MAC_CAPAB_POLL: 4108 /* 4109 * Fallthrough to default, as we don't support GLDv3 4110 * polling. When blanking is implemented, we will need to 4111 * change this to return B_TRUE in addition to registering 4112 * an mc_resources callback. 4113 */ 4114 default: 4115 return (B_FALSE); 4116 } 4117 return (B_TRUE); 4118 } 4119 4120 /* 4121 * GLDv3 entry point to start hardware. 4122 */ 4123 /* ARGSUSED */ 4124 static int 4125 ibd_m_start(void *arg) 4126 { 4127 return (0); 4128 } 4129 4130 /* 4131 * GLDv3 entry point to stop hardware from receiving packets. 4132 */ 4133 /* ARGSUSED */ 4134 static void 4135 ibd_m_stop(void *arg) 4136 { 4137 #ifdef RUN_PERFORMANCE 4138 ibd_perf((ibd_state_t *)arg); 4139 #endif 4140 } 4141 4142 /* 4143 * GLDv3 entry point to modify device's mac address. We do not 4144 * allow address modifications. 4145 */ 4146 static int 4147 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4148 { 4149 ibd_state_t *state; 4150 4151 state = (ibd_state_t *)arg; 4152 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4153 return (0); 4154 else 4155 return (EINVAL); 4156 } 4157 4158 /* 4159 * The blocking part of the IBA join/leave operations are done out 4160 * of here on the async thread. 4161 */ 4162 static void 4163 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4164 { 4165 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4166 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4167 4168 if (op == ASYNC_JOIN) { 4169 4170 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 4171 ibd_print_warn(state, "Joint multicast group failed :" 4172 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4173 } 4174 } else { 4175 /* 4176 * Here, we must search for the proper mcg_info and 4177 * use that to leave the group. 4178 */ 4179 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4180 } 4181 } 4182 4183 /* 4184 * GLDv3 entry point for multicast enable/disable requests. 4185 * This function queues the operation to the async thread and 4186 * return success for a valid multicast address. 
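 */

/*
 * ibd_m_multicst() below does not perform the join/leave itself; it
 * allocates a small request and queues it for the async thread, which
 * then does the blocking IBA work (ibd_async_multicast() above). The
 * fragment below is a hypothetical miniature of such a hand-off; the
 * driver's real ibd_queue_work_slot()/ibd_async_work() (defined
 * earlier in this file) differ in detail. The sketch assumes the
 * mutex and condition variable are initialized at attach time.
 */
#if 0	/* illustrative sketch; not compiled into the driver */
typedef struct sketch_req {
	struct sketch_req	*sr_next;
	int			sr_op;	/* e.g. join or leave */
} sketch_req_t;

static kmutex_t		sketch_q_lock;
static kcondvar_t	sketch_q_cv;
static sketch_req_t	*sketch_q_head;

/* non-blocking side: called from the GLDv3 entry point */
static void
sketch_queue(sketch_req_t *req)
{
	mutex_enter(&sketch_q_lock);
	req->sr_next = sketch_q_head;	/* LIFO purely for brevity */
	sketch_q_head = req;
	cv_signal(&sketch_q_cv);
	mutex_exit(&sketch_q_lock);
}

/* blocking side: the worker thread waits for requests */
static sketch_req_t *
sketch_dequeue(void)
{
	sketch_req_t *req;

	mutex_enter(&sketch_q_lock);
	while ((req = sketch_q_head) == NULL)
		cv_wait(&sketch_q_cv, &sketch_q_lock);
	sketch_q_head = req->sr_next;
	mutex_exit(&sketch_q_lock);
	return (req);
}
#endif

/*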
4187 */ 4188 static int 4189 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 4190 { 4191 ibd_state_t *state = (ibd_state_t *)arg; 4192 ipoib_mac_t maddr, *mcast; 4193 ib_gid_t mgid; 4194 ibd_req_t *req; 4195 4196 /* 4197 * The incoming multicast address might not be aligned properly 4198 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4199 * it to look like one though, to get the offsets of the mc gid, 4200 * since we know we are not going to dereference any values with 4201 * the ipoib_mac_t pointer. 4202 */ 4203 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 4204 mcast = &maddr; 4205 4206 /* 4207 * Check validity of MCG address. We could additionally check 4208 * that a enable/disable is not being issued on the "broadcast" 4209 * mcg, but since this operation is only invokable by priviledged 4210 * programs anyway, we allow the flexibility to those dlpi apps. 4211 * Note that we do not validate the "scope" of the IBA mcg. 4212 */ 4213 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 4214 return (EINVAL); 4215 4216 /* 4217 * fill in multicast pkey and scope 4218 */ 4219 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 4220 4221 /* 4222 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4223 * nothing (ie we stay JOINed to the broadcast group done in 4224 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically 4225 * requires to be joined to broadcast groups at all times. 4226 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4227 * depends on this. 4228 */ 4229 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4230 return (0); 4231 4232 ibd_n2h_gid(mcast, &mgid); 4233 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4234 if (req == NULL) 4235 return (ENOMEM); 4236 4237 req->rq_gid = mgid; 4238 4239 if (add) { 4240 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4241 mgid.gid_prefix, mgid.gid_guid); 4242 ibd_queue_work_slot(state, req, ASYNC_JOIN); 4243 } else { 4244 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4245 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4246 ibd_queue_work_slot(state, req, ASYNC_LEAVE); 4247 } 4248 return (0); 4249 } 4250 4251 /* 4252 * The blocking part of the IBA promiscuous operations are done 4253 * out of here on the async thread. The dlpireq parameter indicates 4254 * whether this invocation is due to a dlpi request or due to 4255 * a port up/down event. 4256 */ 4257 static void 4258 ibd_async_unsetprom(ibd_state_t *state) 4259 { 4260 ibd_mce_t *mce = list_head(&state->id_mc_non); 4261 ib_gid_t mgid; 4262 4263 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4264 4265 while (mce != NULL) { 4266 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4267 mce = list_next(&state->id_mc_non, mce); 4268 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4269 } 4270 state->id_prom_op = NOTSTARTED; 4271 } 4272 4273 /* 4274 * The blocking part of the IBA promiscuous operations are done 4275 * out of here on the async thread. The dlpireq parameter indicates 4276 * whether this invocation is due to a dlpi request or due to 4277 * a port up/down event. 4278 */ 4279 static void 4280 ibd_async_setprom(ibd_state_t *state) 4281 { 4282 ibt_mcg_attr_t mcg_attr; 4283 ibt_mcg_info_t *mcg_info; 4284 ib_gid_t mgid; 4285 uint_t numg; 4286 int i, ret = COMPLETED; 4287 4288 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4289 4290 /* 4291 * Obtain all active MC groups on the IB fabric with 4292 * specified criteria (scope + Pkey + Qkey + mtu). 
4293 */ 4294 bzero(&mcg_attr, sizeof (mcg_attr)); 4295 mcg_attr.mc_pkey = state->id_pkey; 4296 mcg_attr.mc_scope = state->id_scope; 4297 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4298 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4299 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4300 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4301 IBT_SUCCESS) { 4302 ibd_print_warn(state, "Could not get list of IBA multicast " 4303 "groups"); 4304 ret = ERRORED; 4305 goto done; 4306 } 4307 4308 /* 4309 * Iterate over the returned mcg's and join as NonMember 4310 * to the IP mcg's. 4311 */ 4312 for (i = 0; i < numg; i++) { 4313 /* 4314 * Do a NonMember JOIN on the MC group. 4315 */ 4316 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4317 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4318 ibd_print_warn(state, "IBA promiscuous mode missed " 4319 "multicast gid %016llx:%016llx", 4320 (u_longlong_t)mgid.gid_prefix, 4321 (u_longlong_t)mgid.gid_guid); 4322 } 4323 4324 ibt_free_mcg_info(mcg_info, numg); 4325 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4326 done: 4327 state->id_prom_op = ret; 4328 } 4329 4330 /* 4331 * GLDv3 entry point for multicast promiscuous enable/disable requests. 4332 * GLDv3 assumes phys state receives more packets than multi state, 4333 * which is not true for IPoIB. Thus, treat the multi and phys 4334 * promiscuous states the same way to work with GLDv3's assumption. 4335 */ 4336 static int 4337 ibd_m_promisc(void *arg, boolean_t on) 4338 { 4339 ibd_state_t *state = (ibd_state_t *)arg; 4340 ibd_req_t *req; 4341 4342 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4343 if (req == NULL) 4344 return (ENOMEM); 4345 if (on) { 4346 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4347 ibd_queue_work_slot(state, req, ASYNC_PROMON); 4348 } else { 4349 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4350 ibd_queue_work_slot(state, req, ASYNC_PROMOFF); 4351 } 4352 4353 return (0); 4354 } 4355 4356 /* 4357 * GLDv3 entry point for gathering statistics. 4358 */ 4359 static int 4360 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 4361 { 4362 ibd_state_t *state = (ibd_state_t *)arg; 4363 4364 switch (stat) { 4365 case MAC_STAT_IFSPEED: 4366 *val = state->id_link_speed; 4367 break; 4368 case MAC_STAT_MULTIRCV: 4369 *val = state->id_multi_rcv; 4370 break; 4371 case MAC_STAT_BRDCSTRCV: 4372 *val = state->id_brd_rcv; 4373 break; 4374 case MAC_STAT_MULTIXMT: 4375 *val = state->id_multi_xmt; 4376 break; 4377 case MAC_STAT_BRDCSTXMT: 4378 *val = state->id_brd_xmt; 4379 break; 4380 case MAC_STAT_RBYTES: 4381 *val = state->id_recv_bytes; 4382 break; 4383 case MAC_STAT_IPACKETS: 4384 *val = state->id_rcv_pkt; 4385 break; 4386 case MAC_STAT_OBYTES: 4387 *val = state->id_xmt_bytes; 4388 break; 4389 case MAC_STAT_OPACKETS: 4390 *val = state->id_xmt_pkt; 4391 break; 4392 case MAC_STAT_NORCVBUF: 4393 *val = state->id_rx_short; /* # times below water mark */ 4394 break; 4395 case MAC_STAT_OERRORS: 4396 *val = state->id_ah_error; /* failed AH translation */ 4397 break; 4398 case MAC_STAT_IERRORS: 4399 *val = 0; 4400 break; 4401 case MAC_STAT_NOXMTBUF: 4402 *val = state->id_tx_short; 4403 break; 4404 default: 4405 return (ENOTSUP); 4406 } 4407 4408 return (0); 4409 } 4410 4411 /* 4412 * Tx reschedule 4413 */ 4414 static void 4415 ibd_async_txsched(ibd_state_t *state) 4416 { 4417 ibd_req_t *req; 4418 4419 /* 4420 * For poll mode, if ibd is out of Tx wqe, reschedule to collect 4421 * the CQEs. Otherwise, just return for out of Tx wqe. 
4422 */ 4423 4424 if (ibd_txcomp_poll == 1) { 4425 mutex_enter(&state->id_txcomp_lock); 4426 ibd_poll_compq(state, state->id_scq_hdl); 4427 mutex_exit(&state->id_txcomp_lock); 4428 if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { 4429 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4430 ibd_queue_work_slot(state, req, ASYNC_SCHED); 4431 return; 4432 } 4433 } else if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { 4434 return; 4435 } 4436 4437 if (state->id_sched_needed) { 4438 mac_tx_update(state->id_mh); 4439 state->id_sched_needed = B_FALSE; 4440 } 4441 } 4442 4443 /* 4444 * Release one or more chained send wqes back into free list. 4445 */ 4446 static void 4447 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *swqe) 4448 { 4449 /* 4450 * Add back on Tx list for reuse. 4451 */ 4452 swqe->swqe_next = NULL; 4453 mutex_enter(&state->id_tx_list.dl_mutex); 4454 if (state->id_tx_list.dl_pending_sends) { 4455 state->id_tx_list.dl_pending_sends = B_FALSE; 4456 } 4457 if (state->id_tx_list.dl_head == NULL) { 4458 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 4459 } else { 4460 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 4461 } 4462 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 4463 state->id_tx_list.dl_cnt++; 4464 mutex_exit(&state->id_tx_list.dl_mutex); 4465 } 4466 4467 /* 4468 * Acquire send wqe from free list. 4469 * Returns error number and send wqe pointer. 4470 */ 4471 static int 4472 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **swqe) 4473 { 4474 int rc = 0; 4475 ibd_swqe_t *wqe; 4476 4477 /* 4478 * Check and reclaim some of the completed Tx requests. 4479 * If someone else is already in this code and pulling Tx 4480 * completions, no need to poll, since the current lock holder 4481 * will do the work anyway. Normally, we poll for completions 4482 * every few Tx attempts, but if we are short on Tx descriptors, 4483 * we always try to poll. 4484 */ 4485 if ((ibd_txcomp_poll == 1) && 4486 (state->id_tx_list.dl_cnt < IBD_TXPOLL_THRESHOLD) && 4487 (mutex_tryenter(&state->id_txcomp_lock) != 0)) { 4488 DPRINT(10, "ibd_send : polling"); 4489 ibd_poll_compq(state, state->id_scq_hdl); 4490 mutex_exit(&state->id_txcomp_lock); 4491 } 4492 4493 /* 4494 * Grab required transmit wqes. 4495 */ 4496 mutex_enter(&state->id_tx_list.dl_mutex); 4497 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 4498 if (wqe != NULL) { 4499 state->id_tx_list.dl_cnt -= 1; 4500 state->id_tx_list.dl_head = wqe->swqe_next; 4501 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 4502 state->id_tx_list.dl_tail = NULL; 4503 } else { 4504 /* 4505 * If we did not find the number we were looking for, flag 4506 * no resource. Adjust list appropriately in either case. 
4507 */ 4508 rc = ENOENT; 4509 state->id_tx_list.dl_pending_sends = B_TRUE; 4510 DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); 4511 atomic_add_64(&state->id_tx_short, 1); 4512 } 4513 mutex_exit(&state->id_tx_list.dl_mutex); 4514 *swqe = wqe; 4515 4516 return (rc); 4517 } 4518 4519 /* 4520 * The passed in packet has this format: 4521 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 4522 */ 4523 static boolean_t 4524 ibd_send(ibd_state_t *state, mblk_t *mp) 4525 { 4526 ibt_status_t ibt_status; 4527 ibt_mr_attr_t mem_attr; 4528 ibd_ace_t *ace; 4529 ibd_swqe_t *node = NULL; 4530 ipoib_mac_t *dest; 4531 ibd_req_t *req; 4532 ib_header_info_t *ipibp; 4533 ip6_t *ip6h; 4534 mblk_t *nmp = mp; 4535 uint_t pktsize; 4536 size_t blksize; 4537 uchar_t *bufp; 4538 int i, ret, len, nmblks = 1; 4539 boolean_t dofree = B_TRUE; 4540 4541 if ((ret = ibd_acquire_swqes(state, &node)) != 0) { 4542 state->id_sched_needed = B_TRUE; 4543 if (ibd_txcomp_poll == 1) { 4544 goto ibd_send_fail; 4545 } 4546 return (B_FALSE); 4547 } 4548 4549 /* 4550 * Obtain an address handle for the destination. 4551 */ 4552 ipibp = (ib_header_info_t *)mp->b_rptr; 4553 dest = (ipoib_mac_t *)&ipibp->ib_dst; 4554 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 4555 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 4556 4557 pktsize = msgsize(mp); 4558 atomic_add_64(&state->id_xmt_bytes, pktsize); 4559 atomic_inc_64(&state->id_xmt_pkt); 4560 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4561 atomic_inc_64(&state->id_brd_xmt); 4562 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 4563 atomic_inc_64(&state->id_multi_xmt); 4564 4565 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 4566 node->w_ahandle = ace; 4567 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4568 } else { 4569 DPRINT(5, 4570 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 4571 ((ret == EFAULT) ? "failed" : "queued"), 4572 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 4573 htonl(dest->ipoib_gidpref[1]), 4574 htonl(dest->ipoib_gidsuff[0]), 4575 htonl(dest->ipoib_gidsuff[1])); 4576 node->w_ahandle = NULL; 4577 /* 4578 * for the poll mode, it is probably some cqe pending in the 4579 * cq. So ibd has to poll cq here, otherwise acache probably 4580 * may not be recycled. 4581 */ 4582 if (ibd_txcomp_poll == 1) { 4583 mutex_enter(&state->id_txcomp_lock); 4584 ibd_poll_compq(state, state->id_scq_hdl); 4585 mutex_exit(&state->id_txcomp_lock); 4586 } 4587 /* 4588 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 4589 * can not find a path for the specific dest address. We 4590 * should get rid of this kind of packet. With the normal 4591 * case, ibd will return the packet to upper layer and wait 4592 * for AH creating. 4593 */ 4594 if (ret == EFAULT) 4595 ret = B_TRUE; 4596 else { 4597 ret = B_FALSE; 4598 dofree = B_FALSE; 4599 state->id_sched_needed = B_TRUE; 4600 } 4601 goto ibd_send_fail; 4602 } 4603 4604 /* 4605 * For ND6 packets, padding is at the front of the source lladdr. 4606 * Insert the padding at front. 
4607 */ 4608 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) { 4609 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 4610 if (!pullupmsg(mp, IPV6_HDR_LEN + 4611 sizeof (ib_header_info_t))) { 4612 DPRINT(10, "ibd_send: pullupmsg failure "); 4613 ret = B_TRUE; 4614 goto ibd_send_fail; 4615 } 4616 ipibp = (ib_header_info_t *)mp->b_rptr; 4617 } 4618 ip6h = (ip6_t *)((uchar_t *)ipibp + 4619 sizeof (ib_header_info_t)); 4620 len = ntohs(ip6h->ip6_plen); 4621 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 4622 mblk_t *pad; 4623 4624 pad = allocb(4, 0); 4625 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 4626 linkb(mp, pad); 4627 if (MBLKL(mp) < sizeof (ib_header_info_t) + 4628 IPV6_HDR_LEN + len + 4) { 4629 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 4630 IPV6_HDR_LEN + len + 4)) { 4631 DPRINT(10, "ibd_send: pullupmsg " 4632 "failure "); 4633 ret = B_TRUE; 4634 goto ibd_send_fail; 4635 } 4636 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 4637 sizeof (ib_header_info_t)); 4638 } 4639 4640 /* LINTED: E_CONSTANT_CONDITION */ 4641 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 4642 } 4643 } 4644 4645 mp->b_rptr += sizeof (ib_addrs_t); 4646 while (((nmp = nmp->b_cont) != NULL) && 4647 (++nmblks < (state->id_max_sqseg + 1))) 4648 ; 4649 4650 pktsize = msgsize(mp); 4651 /* 4652 * GLDv3 will check mtu. We do checksum related work here. 4653 */ 4654 IBD_CKSUM_SEND(mp); 4655 4656 /* 4657 * Copy the data to preregistered buffers, or register the buffer. 4658 */ 4659 if ((nmblks <= state->id_max_sqseg) && 4660 (pktsize > IBD_TX_COPY_THRESHOLD)) { 4661 for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) { 4662 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr; 4663 mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr; 4664 mem_attr.mr_as = NULL; 4665 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4666 ibt_status = ibt_register_mr(state->id_hca_hdl, 4667 state->id_pd_hdl, &mem_attr, 4668 &node->w_smblkbuf[i].im_mr_hdl, 4669 &node->w_smblkbuf[i].im_mr_desc); 4670 if (ibt_status != IBT_SUCCESS) { 4671 /* 4672 * We do not expect any error other than 4673 * IBT_INSUFF_RESOURCE. 4674 */ 4675 if (ibt_status != IBT_INSUFF_RESOURCE) 4676 DPRINT(10, "ibd_send: %d\n", 4677 "failed in ibt_register_mem()", 4678 ibt_status); 4679 DPRINT(5, "ibd_send: registration failed"); 4680 node->w_swr.wr_nds = i; 4681 /* 4682 * Deregister already registered memory; 4683 * fallback to copying the mblk. 4684 */ 4685 ibd_deregister_mr(state, node); 4686 goto ibd_copy_path; 4687 } 4688 node->w_smblk_sgl[i].ds_va = 4689 (ib_vaddr_t)(uintptr_t)nmp->b_rptr; 4690 node->w_smblk_sgl[i].ds_key = 4691 node->w_smblkbuf[i].im_mr_desc.md_lkey; 4692 node->w_smblk_sgl[i].ds_len = 4693 nmp->b_wptr - nmp->b_rptr; 4694 } 4695 node->swqe_im_mblk = mp; 4696 node->w_swr.wr_sgl = node->w_smblk_sgl; 4697 node->w_swr.wr_nds = nmblks; 4698 dofree = B_FALSE; 4699 } else { 4700 ibd_copy_path: 4701 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 4702 node->w_swr.wr_nds = 1; 4703 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 4704 4705 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 4706 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 4707 blksize = MBLKL(nmp); 4708 bcopy(nmp->b_rptr, bufp, blksize); 4709 bufp += blksize; 4710 } 4711 } 4712 4713 /* 4714 * Queue the wqe to hardware. 4715 */ 4716 ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL); 4717 if (ibt_status != IBT_SUCCESS) { 4718 /* 4719 * We should not fail here; but just in case we do, we 4720 * print out a warning to log. 
4721 */ 4722 ibd_print_warn(state, "ibd_send: posting failed: %d", 4723 ibt_status); 4724 } 4725 4726 DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X", 4727 INCTXPACK, htonl(ace->ac_mac.ipoib_qpn), 4728 htonl(ace->ac_mac.ipoib_gidpref[0]), 4729 htonl(ace->ac_mac.ipoib_gidpref[1]), 4730 htonl(ace->ac_mac.ipoib_gidsuff[0]), 4731 htonl(ace->ac_mac.ipoib_gidsuff[1])); 4732 4733 if (dofree) 4734 freemsg(mp); 4735 4736 return (B_TRUE); 4737 4738 ibd_send_fail: 4739 if (state->id_sched_needed == B_TRUE) { 4740 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4741 if (req != NULL) 4742 ibd_queue_work_slot(state, req, ASYNC_SCHED); 4743 else { 4744 dofree = B_TRUE; 4745 ret = B_TRUE; 4746 } 4747 } 4748 4749 if (dofree) 4750 freemsg(mp); 4751 4752 if (node != NULL) 4753 ibd_tx_cleanup(state, node); 4754 4755 return (ret); 4756 } 4757 4758 /* 4759 * GLDv3 entry point for transmitting datagram. 4760 */ 4761 static mblk_t * 4762 ibd_m_tx(void *arg, mblk_t *mp) 4763 { 4764 ibd_state_t *state = (ibd_state_t *)arg; 4765 mblk_t *next; 4766 4767 while (mp != NULL) { 4768 next = mp->b_next; 4769 mp->b_next = NULL; 4770 if (!ibd_send(state, mp)) { 4771 /* Send fail */ 4772 mp->b_next = next; 4773 break; 4774 } 4775 mp = next; 4776 } 4777 4778 return (mp); 4779 } 4780 4781 /* 4782 * this handles Tx and Rx completions. With separate CQs, this handles 4783 * only Rx completions. 4784 */ 4785 static uint_t 4786 ibd_intr(char *arg) 4787 { 4788 ibd_state_t *state = (ibd_state_t *)arg; 4789 /* 4790 * Poll for completed entries; the CQ will not interrupt any 4791 * more for incoming (or transmitted) packets. 4792 */ 4793 ibd_poll_compq(state, state->id_rcq_hdl); 4794 4795 /* 4796 * Now enable CQ notifications; all packets that arrive now 4797 * (or complete transmission) will cause new interrupts. 4798 */ 4799 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 4800 IBT_SUCCESS) { 4801 /* 4802 * We do not expect a failure here. 4803 */ 4804 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 4805 } 4806 4807 /* 4808 * Repoll to catch all packets that might have arrived after 4809 * we finished the first poll loop and before interrupts got 4810 * armed. 4811 */ 4812 ibd_poll_compq(state, state->id_rcq_hdl); 4813 4814 return (DDI_INTR_CLAIMED); 4815 } 4816 4817 /* 4818 * Common code for interrupt handling as well as for polling 4819 * for all completed wqe's while detaching. 4820 */ 4821 static void 4822 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 4823 { 4824 ibd_wqe_t *wqe; 4825 ibt_wc_t *wc, *wcs; 4826 uint_t numwcs, real_numwcs; 4827 int i; 4828 4829 /* 4830 * In some cases (eg detaching), this code can be invoked on 4831 * any cpu after disabling cq notification (thus no concurrency 4832 * exists). Apart from that, the following applies normally: 4833 * The receive completion handling is always on the Rx interrupt 4834 * cpu. Transmit completion handling could be from any cpu if 4835 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 4836 * is interrupt driven. Combined completion handling is always 4837 * on the interrupt cpu. Thus, lock accordingly and use the 4838 * proper completion array. 
4839 */ 4840 if (ibd_separate_cqs == 1) { 4841 if (cq_hdl == state->id_rcq_hdl) { 4842 wcs = state->id_rxwcs; 4843 numwcs = state->id_rxwcs_size; 4844 } else { 4845 wcs = state->id_txwcs; 4846 numwcs = state->id_txwcs_size; 4847 } 4848 } else { 4849 wcs = state->id_rxwcs; 4850 numwcs = state->id_rxwcs_size; 4851 } 4852 4853 if (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { 4854 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { 4855 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 4856 ASSERT((wqe->w_type == IBD_WQE_SEND) || 4857 (wqe->w_type == IBD_WQE_RECV)); 4858 if (wc->wc_status != IBT_WC_SUCCESS) { 4859 /* 4860 * Channel being torn down. 4861 */ 4862 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 4863 DPRINT(5, "ibd_intr: flush error"); 4864 /* 4865 * Only invoke the Tx handler to 4866 * release possibly held resources 4867 * like AH refcount etc. Can not 4868 * invoke Rx handler because it might 4869 * try adding buffers to the Rx pool 4870 * when we are trying to deinitialize. 4871 */ 4872 if (wqe->w_type == IBD_WQE_RECV) { 4873 continue; 4874 } else { 4875 DPRINT(10, "%s %d", 4876 "ibd_intr: Bad CQ status", 4877 wc->wc_status); 4878 } 4879 } 4880 } 4881 if (wqe->w_type == IBD_WQE_SEND) { 4882 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); 4883 } else { 4884 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 4885 } 4886 } 4887 } 4888 } 4889 4890 /* 4891 * Deregister the mr associated with a given mblk. 4892 */ 4893 static void 4894 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) 4895 { 4896 int i; 4897 4898 DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe, 4899 swqe->w_swr.wr_nds); 4900 4901 for (i = 0; i < swqe->w_swr.wr_nds; i++) { 4902 if (ibt_deregister_mr(state->id_hca_hdl, 4903 swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) { 4904 /* 4905 * We do not expect any errors here. 4906 */ 4907 DPRINT(10, "failed in ibt_deregister_mem()\n"); 4908 } 4909 } 4910 } 4911 4912 /* 4913 * Common code that deals with clean ups after a successful or 4914 * erroneous transmission attempt. 4915 */ 4916 static void 4917 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 4918 { 4919 ibd_ace_t *ace = swqe->w_ahandle; 4920 4921 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 4922 4923 /* 4924 * If this was a dynamic registration in ibd_send(), 4925 * deregister now. 4926 */ 4927 if (swqe->swqe_im_mblk != NULL) { 4928 ibd_deregister_mr(state, swqe); 4929 freemsg(swqe->swqe_im_mblk); 4930 swqe->swqe_im_mblk = NULL; 4931 } 4932 4933 /* 4934 * Drop the reference count on the AH; it can be reused 4935 * now for a different destination if there are no more 4936 * posted sends that will use it. This can be eliminated 4937 * if we can always associate each Tx buffer with an AH. 4938 * The ace can be null if we are cleaning up from the 4939 * ibd_send() error path. 4940 */ 4941 if (ace != NULL) { 4942 /* 4943 * The recycling logic can be eliminated from here 4944 * and put into the async thread if we create another 4945 * list to hold ACE's for unjoined mcg's. 4946 */ 4947 if (DEC_REF_DO_CYCLE(ace)) { 4948 ibd_mce_t *mce; 4949 4950 /* 4951 * Check with the lock taken: we decremented 4952 * reference count without the lock, and some 4953 * transmitter might alreay have bumped the 4954 * reference count (possible in case of multicast 4955 * disable when we leave the AH on the active 4956 * list). If not still 0, get out, leaving the 4957 * recycle bit intact. 
4958 * 4959 * Atomically transition the AH from active 4960 * to free list, and queue a work request to 4961 * leave the group and destroy the mce. No 4962 * transmitter can be looking at the AH or 4963 * the MCE in between, since we have the 4964 * ac_mutex lock. In the SendOnly reap case, 4965 * it is not neccesary to hold the ac_mutex 4966 * and recheck the ref count (since the AH was 4967 * taken off the active list), we just do it 4968 * to have uniform processing with the Full 4969 * reap case. 4970 */ 4971 mutex_enter(&state->id_ac_mutex); 4972 mce = ace->ac_mce; 4973 if (GET_REF_CYCLE(ace) == 0) { 4974 CLEAR_REFCYCLE(ace); 4975 /* 4976 * Identify the case of fullmember reap as 4977 * opposed to mcg trap reap. Also, port up 4978 * might set ac_mce to NULL to indicate Tx 4979 * cleanup should do no more than put the 4980 * AH in the free list (see ibd_async_link). 4981 */ 4982 if (mce != NULL) { 4983 ace->ac_mce = NULL; 4984 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 4985 /* 4986 * mc_req was initialized at mce 4987 * creation time. 4988 */ 4989 ibd_queue_work_slot(state, 4990 &mce->mc_req, ASYNC_REAP); 4991 } 4992 IBD_ACACHE_INSERT_FREE(state, ace); 4993 } 4994 mutex_exit(&state->id_ac_mutex); 4995 } 4996 } 4997 4998 /* 4999 * Release the send wqe for reuse. 5000 */ 5001 ibd_release_swqes(state, swqe); 5002 } 5003 5004 /* 5005 * Processing to be done after receipt of a packet; hand off to GLD 5006 * in the format expected by GLD. 5007 * The recvd packet has this format: 2b sap :: 00 :: data. 5008 */ 5009 static void 5010 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 5011 { 5012 ib_header_info_t *phdr; 5013 mblk_t *mp; 5014 ipoib_hdr_t *ipibp; 5015 ip6_t *ip6h; 5016 int rxcnt, len; 5017 5018 /* 5019 * Track number handed to upper layer, and number still 5020 * available to receive packets. 5021 */ 5022 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 5023 ASSERT(rxcnt >= 0); 5024 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); 5025 5026 /* 5027 * Adjust write pointer depending on how much data came in. 5028 */ 5029 mp = rwqe->rwqe_im_mblk; 5030 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; 5031 5032 /* 5033 * the IB link will deliver one of the IB link layer 5034 * headers called, the Global Routing Header (GRH). 5035 * ibd driver uses the information in GRH to build the 5036 * Header_info structure and pass it with the datagram up 5037 * to GLDv3. 5038 * If the GRH is not valid, indicate to GLDv3 by setting 5039 * the VerTcFlow field to 0. 5040 */ 5041 phdr = (ib_header_info_t *)mp->b_rptr; 5042 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 5043 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 5044 5045 /* if it is loop back packet, just drop it. */ 5046 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 5047 IPOIB_ADDRL) == 0) { 5048 freemsg(mp); 5049 return; 5050 } 5051 5052 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 5053 sizeof (ipoib_mac_t)); 5054 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 5055 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 5056 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 5057 } else { 5058 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 5059 } 5060 } else { 5061 /* 5062 * It can not be a IBA multicast packet. Must have been 5063 * unicast for us. Just copy the interface address to dst. 
5064 */ 5065 phdr->ib_grh.ipoib_vertcflow = 0; 5066 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 5067 sizeof (ipoib_mac_t)); 5068 } 5069 5070 DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK); 5071 5072 /* 5073 * For ND6 packets, padding is at the front of the source/target 5074 * lladdr. However the inet6 layer is not aware of it, hence remove 5075 * the padding from such packets. 5076 */ 5077 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 5078 if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) { 5079 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { 5080 if (!pullupmsg(mp, IPV6_HDR_LEN + 5081 sizeof (ipoib_hdr_t))) { 5082 DPRINT(10, "ibd_process_rx: pullupmsg failed"); 5083 freemsg(mp); 5084 return; 5085 } 5086 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + 5087 sizeof (ipoib_pgrh_t)); 5088 } 5089 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 5090 len = ntohs(ip6h->ip6_plen); 5091 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5092 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + 5093 IPV6_HDR_LEN + len) { 5094 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + 5095 IPV6_HDR_LEN + len)) { 5096 DPRINT(10, "ibd_process_rx: pullupmsg" 5097 " failed"); 5098 freemsg(mp); 5099 return; 5100 } 5101 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 5102 sizeof (ipoib_pgrh_t) + 5103 sizeof (ipoib_hdr_t)); 5104 } 5105 /* LINTED: E_CONSTANT_CONDITION */ 5106 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 5107 } 5108 } 5109 5110 atomic_add_64(&state->id_recv_bytes, wc->wc_bytes_xfer); 5111 atomic_inc_64(&state->id_rcv_pkt); 5112 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5113 atomic_inc_64(&state->id_brd_rcv); 5114 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5115 atomic_inc_64(&state->id_multi_rcv); 5116 /* 5117 * Hand off to service thread/GLD. When we have hardware that 5118 * does hardware checksum, we will pull the checksum from the 5119 * work completion structure here. 5120 * on interrupt cpu. 5121 */ 5122 ibd_send_up(state, mp); 5123 5124 /* 5125 * Possibly replenish the Rx pool if needed. 5126 */ 5127 if (rxcnt < IBD_RX_THRESHOLD) { 5128 state->id_rx_short++; 5129 if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) { 5130 if (ibd_post_rwqe(state, rwqe, B_FALSE) == 5131 DDI_FAILURE) { 5132 ibd_free_rwqe(state, rwqe); 5133 return; 5134 } 5135 } 5136 } 5137 } 5138 5139 /* 5140 * Callback code invoked from STREAMs when the recv data buffer is free 5141 * for recycling. 5142 */ 5143 static void 5144 ibd_freemsg_cb(char *arg) 5145 { 5146 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 5147 ibd_state_t *state = rwqe->w_state; 5148 5149 /* 5150 * If the wqe is being destructed, do not attempt recycling. 5151 */ 5152 if (rwqe->w_freeing_wqe == B_TRUE) { 5153 DPRINT(6, "ibd_freemsg: wqe being freed"); 5154 return; 5155 } 5156 5157 /* 5158 * Upper layer has released held mblk. 5159 */ 5160 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 5161 5162 if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) { 5163 /* 5164 * There are already enough buffers on the Rx ring. 5165 * Free this one up. 
5166 */ 5167 rwqe->rwqe_im_mblk = NULL; 5168 ibd_delete_rwqe(state, rwqe); 5169 ibd_free_rwqe(state, rwqe); 5170 DPRINT(6, "ibd_freemsg: free up wqe"); 5171 } else { 5172 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 5173 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 5174 if (rwqe->rwqe_im_mblk == NULL) { 5175 ibd_delete_rwqe(state, rwqe); 5176 ibd_free_rwqe(state, rwqe); 5177 DPRINT(6, "ibd_freemsg: desballoc failed"); 5178 return; 5179 } 5180 5181 /* 5182 * Post back to h/w. We could actually have more than 5183 * id_num_rwqe WQEs on the list if there were multiple 5184 * ibd_freemsg_cb() calls outstanding (since the lock is 5185 * not held the entire time). This will start getting 5186 * corrected over subsequent ibd_freemsg_cb() calls. 5187 */ 5188 if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) { 5189 ibd_delete_rwqe(state, rwqe); 5190 ibd_free_rwqe(state, rwqe); 5191 return; 5192 } 5193 } 5194 } 5195 5196 static uint_t 5197 ibd_tx_recycle(char *arg) 5198 { 5199 ibd_state_t *state = (ibd_state_t *)arg; 5200 5201 /* 5202 * Poll for completed entries; the CQ will not interrupt any 5203 * more for completed packets. 5204 */ 5205 ibd_poll_compq(state, state->id_scq_hdl); 5206 5207 /* 5208 * Now enable CQ notifications; all completions originating now 5209 * will cause new interrupts. 5210 */ 5211 if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != 5212 IBT_SUCCESS) { 5213 /* 5214 * We do not expect a failure here. 5215 */ 5216 DPRINT(10, "ibd_tx_recycle: ibt_enable_cq_notify() failed"); 5217 } 5218 5219 /* 5220 * Repoll to catch all packets that might have completed after 5221 * we finished the first poll loop and before interrupts got 5222 * armed. 5223 */ 5224 ibd_poll_compq(state, state->id_scq_hdl); 5225 5226 /* 5227 * Call txsched to notify GLDv3 if it required. 5228 */ 5229 ibd_async_txsched(state); 5230 5231 return (DDI_INTR_CLAIMED); 5232 } 5233 #ifdef RUN_PERFORMANCE 5234 5235 /* 5236 * To run the performance test, first do the "ifconfig ibdN plumb" on 5237 * the Rx and Tx side. Then use mdb -kw to tweak the following variables: 5238 * ibd_performance=1. 5239 * ibd_receiver=1 on Rx side. 5240 * ibd_sender=1 on Tx side. 5241 * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update 5242 * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx, this will 5243 * make it drop into a 1 minute loop waiting for packets. An 5244 * ifconfig/unplumb on the Tx will cause it to send packets to Rx. 5245 */ 5246 5247 #define IBD_NUM_UNSIGNAL ibd_num_unsignal 5248 #define IBD_TX_PKTSIZE ibd_tx_pktsize 5249 #define IBD_TX_DATASIZE ibd_tx_datasize 5250 5251 static ibd_swqe_t **swqes; 5252 static ibt_wc_t *wcs; 5253 5254 /* 5255 * Set these on Rx and Tx side to do performance run. 5256 */ 5257 static int ibd_performance = 0; 5258 static int ibd_receiver = 0; 5259 static int ibd_sender = 0; 5260 static ipoib_mac_t ibd_dest; 5261 5262 /* 5263 * Interrupt coalescing is achieved by asking for a completion intr 5264 * only every ibd_num_unsignal'th packet. 5265 */ 5266 static int ibd_num_unsignal = 8; 5267 5268 /* 5269 * How big is each packet? 5270 */ 5271 static int ibd_tx_pktsize = 2048; 5272 5273 /* 5274 * Total data size to be transmitted. 
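 */

/*
 * The performance code below coalesces Tx completion interrupts by
 * requesting a signaled completion only on every ibd_num_unsignal'th
 * work request, so the CQ handler fires once per group instead of
 * once per packet (see the wr_flags setup in ibd_perf_tx()). The
 * helper below just makes the flag arithmetic explicit;
 * sketch_mark_signaled() is hypothetical and not used by the driver.
 */
#if 0	/* illustrative sketch; not compiled into the driver */
static void
sketch_mark_signaled(ibt_send_wr_t *wrs, uint_t nwrs, uint_t group)
{
	uint_t i;

	for (i = 0; i < nwrs; i++) {
		if (((i + 1) % group) == 0)
			wrs[i].wr_flags = IBT_WR_SEND_SIGNAL;
		else
			wrs[i].wr_flags = IBT_WR_NO_FLAGS;
	}
}
#endif

/*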
5275 */ 5276 static int ibd_tx_datasize = 512*1024*1024; 5277 5278 static volatile boolean_t cq_handler_ran = B_FALSE; 5279 static volatile int num_completions; 5280 5281 /* ARGSUSED */ 5282 static void 5283 ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg) 5284 { 5285 ibd_state_t *state = (ibd_state_t *)arg; 5286 ibt_cq_hdl_t cqhdl; 5287 ibd_wqe_t *wqe; 5288 uint_t polled, i; 5289 boolean_t cq_enabled = B_FALSE; 5290 5291 if (ibd_receiver == 1) 5292 cqhdl = state->id_rcq_hdl; 5293 else 5294 cqhdl = state->id_scq_hdl; 5295 5296 /* 5297 * Mark the handler as having run and possibly freed up some 5298 * slots. Blocked sends can be retried. 5299 */ 5300 cq_handler_ran = B_TRUE; 5301 5302 repoll: 5303 while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) == 5304 IBT_SUCCESS) { 5305 num_completions += polled; 5306 if (ibd_receiver == 1) { 5307 /* 5308 * We can immediately recycle the buffer. No 5309 * need to pass up to any IP layer ... 5310 */ 5311 for (i = 0; i < polled; i++) { 5312 wqe = (ibd_wqe_t *)wcs[i].wc_id; 5313 (void) ibt_post_recv(state->id_chnl_hdl, 5314 &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL); 5315 } 5316 } 5317 } 5318 5319 /* 5320 * If we just repolled, we are done; exit. 5321 */ 5322 if (cq_enabled) 5323 return; 5324 5325 /* 5326 * Enable CQ. 5327 */ 5328 if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 5329 /* 5330 * We do not expect a failure here. 5331 */ 5332 cmn_err(CE_CONT, "ibd_perf_handler: notify failed"); 5333 } 5334 cq_enabled = B_TRUE; 5335 5336 /* 5337 * Repoll for packets that came in after we finished previous 5338 * poll loop but before we turned on notifications. 5339 */ 5340 goto repoll; 5341 } 5342 5343 static void 5344 ibd_perf_tx(ibd_state_t *state) 5345 { 5346 ibt_mr_hdl_t mrhdl; 5347 ibt_mr_desc_t mrdesc; 5348 ibt_mr_attr_t mem_attr; 5349 ibt_status_t stat; 5350 ibd_ace_t *ace = NULL; 5351 ibd_swqe_t *node; 5352 uchar_t *sendbuf; 5353 longlong_t stime, etime; 5354 longlong_t sspin, espin, tspin = 0; 5355 int i, reps, packets; 5356 5357 cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X", 5358 htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]), 5359 htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]), 5360 htonl(ibd_dest.ipoib_gidsuff[1])); 5361 if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) || 5362 (ibd_dest.ipoib_gidpref[1] == 0)) { 5363 cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address"); 5364 return; 5365 } 5366 5367 packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE); 5368 reps = (packets / IBD_NUM_SWQE); 5369 5370 cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE); 5371 cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE); 5372 cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets); 5373 cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE); 5374 cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL); 5375 if ((packets % IBD_NUM_UNSIGNAL) != 0) { 5376 /* 5377 * This is required to ensure the last packet will trigger 5378 * a CQ handler callback, thus we can spin waiting fot all 5379 * packets to be received. 

#define	IBD_NUM_UNSIGNAL	ibd_num_unsignal
#define	IBD_TX_PKTSIZE		ibd_tx_pktsize
#define	IBD_TX_DATASIZE		ibd_tx_datasize

static ibd_swqe_t **swqes;
static ibt_wc_t *wcs;

/*
 * Set these on the Rx and Tx side to do a performance run.
 */
static int ibd_performance = 0;
static int ibd_receiver = 0;
static int ibd_sender = 0;
static ipoib_mac_t ibd_dest;

/*
 * Interrupt coalescing is achieved by asking for a completion intr
 * only every ibd_num_unsignal'th packet.
 */
static int ibd_num_unsignal = 8;

/*
 * How big is each packet?
 */
static int ibd_tx_pktsize = 2048;

/*
 * Total data size to be transmitted.
 */
static int ibd_tx_datasize = 512*1024*1024;
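
/*
 * A worked example with the defaults above (for illustration only):
 * 512MB of data at 2048 bytes per packet is 262144 packets, and with
 * ibd_num_unsignal set to 8 only every 8th send work request is posted
 * signaled, so a complete run produces 262144 / 8 = 32768 signaled
 * completions, which is what ibd_perf_tx spins on.  Note that the
 * completion accounting below assumes the tunables are chosen together:
 * IBD_NUM_SWQE should be a multiple of IBD_NUM_UNSIGNAL and the packet
 * count a multiple of IBD_NUM_SWQE, although only the "packet count is
 * a multiple of IBD_NUM_UNSIGNAL" condition is checked explicitly.
 */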

static volatile boolean_t cq_handler_ran = B_FALSE;
static volatile int num_completions;

/* ARGSUSED */
static void
ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ibt_cq_hdl_t cqhdl;
	ibd_wqe_t *wqe;
	uint_t polled, i;
	boolean_t cq_enabled = B_FALSE;

	if (ibd_receiver == 1)
		cqhdl = state->id_rcq_hdl;
	else
		cqhdl = state->id_scq_hdl;

	/*
	 * Mark the handler as having run and possibly freed up some
	 * slots. Blocked sends can be retried.
	 */
	cq_handler_ran = B_TRUE;

repoll:
	while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) ==
	    IBT_SUCCESS) {
		num_completions += polled;
		if (ibd_receiver == 1) {
			/*
			 * We can immediately recycle the buffer. No
			 * need to pass up to any IP layer ...
			 */
			for (i = 0; i < polled; i++) {
				wqe = (ibd_wqe_t *)wcs[i].wc_id;
				(void) ibt_post_recv(state->id_chnl_hdl,
				    &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL);
			}
		}
	}

	/*
	 * If we just repolled, we are done; exit.
	 */
	if (cq_enabled)
		return;

	/*
	 * Enable CQ.
	 */
	if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
		/*
		 * We do not expect a failure here.
		 */
		cmn_err(CE_CONT, "ibd_perf_handler: notify failed");
	}
	cq_enabled = B_TRUE;

	/*
	 * Repoll for packets that came in after we finished the previous
	 * poll loop but before we turned on notifications.
	 */
	goto repoll;
}

static void
ibd_perf_tx(ibd_state_t *state)
{
	ibt_mr_hdl_t mrhdl;
	ibt_mr_desc_t mrdesc;
	ibt_mr_attr_t mem_attr;
	ibt_status_t stat;
	ibd_ace_t *ace = NULL;
	ibd_swqe_t *node;
	uchar_t *sendbuf;
	longlong_t stime, etime;
	longlong_t sspin, espin, tspin = 0;
	int i, reps, packets;

	cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X",
	    htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]),
	    htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]),
	    htonl(ibd_dest.ipoib_gidsuff[1]));
	if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) ||
	    (ibd_dest.ipoib_gidpref[1] == 0)) {
		cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address");
		return;
	}

	packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE);
	reps = (packets / IBD_NUM_SWQE);

	cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE);
	cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE);
	cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets);
	cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE);
	cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL);
	if ((packets % IBD_NUM_UNSIGNAL) != 0) {
		/*
		 * This is required to ensure the last packet will trigger
		 * a CQ handler callback, so that we can spin waiting for
		 * all packets to be received.
		 */
		cmn_err(CE_CONT,
		    "ibd_perf_tx: #Packets not multiple of Signal Grp size");
		return;
	}
	num_completions = 0;

	swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE,
	    KM_NOSLEEP);
	if (swqes == NULL) {
		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
		return;
	}

	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
	if (wcs == NULL) {
		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
		cmn_err(CE_CONT, "ibd_perf_tx: no storage");
		return;
	}

	/*
	 * Get the ud_dest for the destination.
	 */
	ibd_async_acache(state, &ibd_dest);
	mutex_enter(&state->id_ac_mutex);
	ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0);
	mutex_exit(&state->id_ac_mutex);
	if (ace == NULL) {
		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
		cmn_err(CE_CONT, "ibd_perf_tx: no AH");
		return;
	}

	/*
	 * Set up the send buffer.
	 */
	sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP);
	if (sendbuf == NULL) {
		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
		cmn_err(CE_CONT, "ibd_perf_tx: no send buffer");
		return;
	}

	/*
	 * This buffer is used when we want to send data from the same
	 * memory area over and over; it might help in reducing memory
	 * traffic.
	 */
	mem_attr.mr_vaddr = (uint64_t)sendbuf;
	mem_attr.mr_len = IBD_TX_PKTSIZE;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_NOSLEEP;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &mrhdl, &mrdesc) != IBT_SUCCESS) {
		kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
		kmem_free(sendbuf, IBD_TX_PKTSIZE);
		kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
		cmn_err(CE_CONT, "ibd_perf_tx: registration failed");
		return;
	}

	/*
	 * Allocate private send wqes.
	 */
	for (i = 0; i < IBD_NUM_SWQE; i++) {
		if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) {
			/* free whatever was already allocated */
			while (--i >= 0)
				ibd_free_swqe(state, swqes[i]);
			(void) ibt_deregister_mr(state->id_hca_hdl, mrhdl);
			kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
			kmem_free(sendbuf, IBD_TX_PKTSIZE);
			kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
			cmn_err(CE_CONT, "ibd_alloc_swqe failure");
			return;
		}
		node->w_ahandle = ace;
#if 0
		node->w_smblkbuf[0].im_mr_hdl = mrhdl;
		node->w_smblkbuf[0].im_mr_desc = mrdesc;
		node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf;
		node->w_smblk_sgl[0].ds_key =
		    node->w_smblkbuf[0].im_mr_desc.md_lkey;
		node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE;
		node->w_swr.wr_sgl = node->w_smblk_sgl;
#else
		node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE;
		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
#endif

		/*
		 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs
		 * is marked to invoke the CQ handler. That is the only
		 * way we come to know when the send queue can accept more
		 * WRs.
		 */
		if (((i + 1) % IBD_NUM_UNSIGNAL) != 0)
			node->w_swr.wr_flags = IBT_WR_NO_FLAGS;
		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
		node->w_swr.wr_nds = 1;

		swqes[i] = node;
	}

	ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state);

	/*
	 * Post all the requests. We expect this stream of posts will
	 * not overwhelm the hardware, due to the periodic completions
	 * and polling that happen out of ibd_perf_handler.
	 * Post requests until the channel stops accepting them; after
	 * that, wait for the CQ handler to notify us that there is more
	 * space.
	 */
	stime = gethrtime();
	for (; reps > 0; reps--)
		for (i = 0; i < IBD_NUM_SWQE; i++) {
			node = swqes[i];
retry:
			if ((stat = ibt_post_send(state->id_chnl_hdl,
			    &node->w_swr, 1, NULL)) != IBT_SUCCESS) {
				if (stat == IBT_CHAN_FULL) {
					/*
					 * Spin till the CQ handler runs
					 * and then try again.
					 */
					sspin = gethrtime();
					while (!cq_handler_ran)
						;
					espin = gethrtime();
					tspin += (espin - sspin);
					cq_handler_ran = B_FALSE;
					goto retry;
				}
				cmn_err(CE_CONT, "post failure %d/%d", stat, i);
				goto done;
			}
		}

done:
	/*
	 * We should really be snapshotting when we get the last
	 * completion.
	 */
	while (num_completions != (packets / IBD_NUM_UNSIGNAL))
		;
	etime = gethrtime();

	cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d",
	    num_completions);
	cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime));
	cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin);

	/*
	 * Wait a couple of seconds for everything to quiesce.
	 */
	delay(drv_usectohz(2000000));

	/*
	 * Reset CQ handler to real one; free resources.
	 */
	if (ibd_separate_cqs == 0) {
		ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state);
	} else {
		if (ibd_txcomp_poll == 0)
			ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler,
			    state);
		else
			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
	}

	for (i = 0; i < IBD_NUM_SWQE; i++)
		ibd_free_swqe(state, swqes[i]);
	(void) ibt_deregister_mr(state->id_hca_hdl, mrhdl);
	kmem_free(sendbuf, IBD_TX_PKTSIZE);
	kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
}

static void
ibd_perf_rx(ibd_state_t *state)
{
	wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
	if (wcs == NULL) {
		cmn_err(CE_CONT, "ibd_perf_rx: no storage");
		return;
	}

	/*
	 * We do not need to allocate private recv wqes. We will
	 * just use the regular ones.
	 */

	num_completions = 0;
	ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state);

	/*
	 * Delay for a minute to let all the packets come in from the
	 * transmitter.
	 */
	cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_RWQE);
	delay(drv_usectohz(60000000));
	cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions);

	/*
	 * Reset CQ handler to real one; free resources.
	 */
	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
	kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
}

static void
ibd_perf(ibd_state_t *state)
{
	if (ibd_performance == 0)
		return;

	if (ibd_receiver == 1) {
		ibd_perf_rx(state);
		return;
	}

	if (ibd_sender == 1) {
		ibd_perf_tx(state);
		return;
	}
}

#endif /* RUN_PERFORMANCE */