1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * An implementation of the IPoIB standard based on PSARC 2001/289. 30 */ 31 32 #include <sys/types.h> 33 #include <sys/conf.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/modctl.h> 37 #include <sys/stropts.h> 38 #include <sys/stream.h> 39 #include <sys/strsun.h> 40 #include <sys/strsubr.h> 41 #include <sys/dlpi.h> 42 43 #include <sys/pattr.h> /* for HCK_PARTIALCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IP */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip_if.h> /* for IP6_DL_SAP */ 54 #include <inet/ip6.h> /* for ip6_t */ 55 #include <netinet/icmp6.h> /* for icmp6_t */ 56 #include <sys/callb.h> 57 #include <sys/modhash.h> 58 59 #include <sys/ib/clients/ibd/ibd.h> 60 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 61 #include <sys/note.h> 62 #include <sys/pattr.h> 63 #include <sys/multidata.h> 64 65 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 66 67 /* 68 * Modes of hardware/driver/software checksum, useful for debugging 69 * and performance studies. 70 * 71 * none: h/w (Tavor) and driver does not do checksum, IP software must. 72 * partial: driver does data checksum, IP must provide psuedo header. 73 * perf_partial: driver uses IP provided psuedo cksum as data checksum 74 * (thus, real checksumming is not done). 75 */ 76 typedef enum { 77 IBD_CSUM_NONE, 78 IBD_CSUM_PARTIAL, 79 IBD_CSUM_PERF_PARTIAL 80 } ibd_csum_type_t; 81 82 typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t; 83 84 /* 85 * Per interface tunable parameters. 86 */ 87 static uint_t ibd_rx_threshold = 16; 88 static uint_t ibd_tx_current_copy_threshold = 0x10000000; 89 static uint_t ibd_num_rwqe = 4095; /* 1 less than max Tavor CQsize */ 90 static uint_t ibd_num_swqe = 4095; /* 1 less than max Tavor CQsize */ 91 static uint_t ibd_num_ah = 16; 92 static uint_t ibd_hash_size = 16; 93 static uint_t ibd_srv_fifos = 0xffff; 94 static uint_t ibd_fifo_depth = 0; 95 static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE; 96 static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE; 97 98 /* 99 * The driver can use separate CQs for send and receive queueus. 100 * While using separate CQs, it is possible to put the send CQ 101 * in polling mode, ie not to enable notifications on that CQ. 102 * If both CQs are interrupt driven, currently it is not possible 103 * for their handlers to be invoked concurrently (since Tavor ties 104 * both interrupts to the same PCI intr line); but the handlers 105 * are not coded with a single interrupt cpu assumption (eg 106 * id_num_intrs is incremented atomically). 107 * 108 * The driver private struct uses id_scq_hdl to track the separate 109 * CQ being used for send; the id_rcq_hdl tracks the receive CQ 110 * if using separate CQs, or it tracks the single CQ when using 111 * combined CQ. The id_wcs completion array is used in the combined 112 * CQ case, and for fetching Rx completions in the separate CQs case; 113 * the id_txwcs is used to fetch Tx completions in the separate CQs 114 * case. 115 */ 116 static uint_t ibd_separate_cqs = 1; 117 static uint_t ibd_txcomp_poll = 0; 118 119 /* 120 * Initial number of IBA resources allocated. 121 */ 122 #define IBD_NUM_RWQE ibd_num_rwqe 123 #define IBD_NUM_SWQE ibd_num_swqe 124 #define IBD_NUM_AH ibd_num_ah 125 126 /* when <= threshold, it's faster to copy to a premapped buffer */ 127 #define IBD_TX_COPY_THRESHOLD ibd_tx_current_copy_threshold 128 129 /* 130 * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will 131 * allocate a new WQE to put on the the rxlist. This value must be <= 132 * IBD_NUM_RWQE/id_num_rwqe. 133 */ 134 #define IBD_RX_THRESHOLD ibd_rx_threshold 135 136 /* 137 * Hash table size for the active AH list. 138 */ 139 #define IBD_HASH_SIZE ibd_hash_size 140 141 /* 142 * Size of completion array to be filled by a single poll call. 143 */ 144 #define IBD_WC_SIZE 16 145 146 /* 147 * We poll every (IBD_TXPOLL_MASK + 1) sends for completions. This 148 * is based on our above completion array size. 149 */ 150 #define IBD_TXPOLL_MASK 0xf 151 152 /* 153 * Number of payload areas the MDT code can support. Choose the same value 154 * that we know is supported by TCP/MDT. 155 */ 156 #define IBD_MDTMAX_SEGS 16 157 158 /* 159 * PAD routine called during send/recv context 160 */ 161 #define IBD_SEND 0 162 #define IBD_RECV 1 163 164 /* Driver State Pointer */ 165 void *ibd_list; 166 167 /* Required system entry points */ 168 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 169 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 170 171 /* Required driver entry points for GLD */ 172 static int ibd_reset(gld_mac_info_t *); 173 static int ibd_start(gld_mac_info_t *); 174 static int ibd_stop(gld_mac_info_t *); 175 static int ibd_set_mac_addr(gld_mac_info_t *, unsigned char *); 176 static int ibd_set_multicast(gld_mac_info_t *, unsigned char *, int); 177 static int ibd_set_promiscuous(gld_mac_info_t *, int); 178 static int ibd_get_stats(gld_mac_info_t *, struct gld_stats *); 179 static int ibd_send(gld_mac_info_t *, mblk_t *); 180 static int ibd_mdt_pre(gld_mac_info_t *, mblk_t *, void **); 181 static void ibd_mdt_txone(gld_mac_info_t *, void *, pdescinfo_t *); 182 static void ibd_mdt_post(gld_mac_info_t *, mblk_t *, void *); 183 static uint_t ibd_intr(gld_mac_info_t *); 184 185 /* Private driver entry points for GLD */ 186 static int ibd_state_init(ibd_state_t *, dev_info_t *); 187 static void ibd_state_fini(ibd_state_t *); 188 static int ibd_drv_init(ibd_state_t *); 189 static void ibd_drv_fini(ibd_state_t *); 190 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 191 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 192 static void ibd_snet_notices_handler(void *, ib_gid_t, 193 ibt_subnet_event_code_t, ibt_subnet_event_t *); 194 static int ibd_init_txlist(ibd_state_t *); 195 static void ibd_fini_txlist(ibd_state_t *); 196 static int ibd_init_rxlist(ibd_state_t *); 197 static void ibd_fini_rxlist(ibd_state_t *); 198 static void ibd_freemsg_cb(char *); 199 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *, boolean_t); 200 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 201 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **); 202 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 203 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 204 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 205 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 206 ibt_async_event_t *); 207 static int ibd_acache_init(ibd_state_t *); 208 static void ibd_acache_fini(ibd_state_t *); 209 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 210 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 211 static void ibd_async_unsetprom(ibd_state_t *, boolean_t); 212 static void ibd_async_setprom(ibd_state_t *, boolean_t); 213 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 214 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 215 static void ibd_async_txsched(ibd_state_t *); 216 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 217 static void ibd_async_work(ibd_state_t *); 218 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 219 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 220 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t); 221 static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *, 222 ipoib_mac_t *); 223 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 224 static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *); 225 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 226 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 227 static uint64_t ibd_get_portspeed(ibd_state_t *); 228 229 #ifdef RUN_PERFORMANCE 230 static void ibd_perf(ibd_state_t *); 231 #endif 232 233 /* Streams Module Info */ 234 static struct module_info ibd_minfo = { 235 IBD_IDNUM, /* module ID Number */ 236 "ibd", /* module name */ 237 0, /* min packet size */ 238 INFPSZ, /* maximum packet size */ 239 IBD_HIWAT, /* high water mark */ 240 IBD_LOWAT /* low water mark */ 241 }; 242 243 /* Streams Read Queue */ 244 static struct qinit ibd_rdinit = { 245 NULL, /* put */ 246 gld_rsrv, /* service */ 247 gld_open, /* open */ 248 gld_close, /* close */ 249 NULL, /* unused */ 250 &ibd_minfo, /* parameters */ 251 NULL /* statistics */ 252 }; 253 254 /* Streams Write Queue */ 255 static struct qinit ibd_wrinit = { 256 gld_wput, /* put */ 257 gld_wsrv, /* service */ 258 NULL, /* open */ 259 NULL, /* close */ 260 NULL, /* unused */ 261 &ibd_minfo, /* parameters */ 262 NULL /* statistics */ 263 }; 264 265 /* Stream Operations */ 266 static struct streamtab ibd_streamtab = { 267 &ibd_rdinit, /* read queue */ 268 &ibd_wrinit, /* write queue */ 269 NULL, /* lower read queue (MUX) */ 270 NULL /* lower write queue (MUX) */ 271 }; 272 273 /* Character/Block Operations */ 274 static struct cb_ops ibd_cb_ops = { 275 nulldev, /* open */ 276 nulldev, /* close */ 277 nodev, /* strategy (block) */ 278 nodev, /* print (block) */ 279 nodev, /* dump (block) */ 280 nodev, /* read */ 281 nodev, /* write */ 282 nodev, /* ioctl */ 283 nodev, /* devmap */ 284 nodev, /* mmap */ 285 nodev, /* segmap */ 286 nochpoll, /* chpoll */ 287 ddi_prop_op, /* prop_op */ 288 &ibd_streamtab, /* streams */ 289 D_MP | D_64BIT, /* flags */ 290 CB_REV /* rev */ 291 }; 292 293 /* Driver Operations */ 294 static struct dev_ops ibd_dev_ops = { 295 DEVO_REV, /* struct rev */ 296 0, /* refcnt */ 297 gld_getinfo, /* getinfo */ 298 nulldev, /* identify */ 299 nulldev, /* probe */ 300 ibd_attach, /* attach */ 301 ibd_detach, /* detach */ 302 nodev, /* reset */ 303 &ibd_cb_ops, /* cb_ops */ 304 NULL, /* bus_ops */ 305 nodev /* power */ 306 }; 307 308 /* Module Driver Info */ 309 static struct modldrv ibd_modldrv = { 310 &mod_driverops, 311 "InfiniBand DLPI Driver %I%", 312 &ibd_dev_ops 313 }; 314 315 /* Module Linkage */ 316 static struct modlinkage ibd_modlinkage = { 317 MODREV_1, 318 &ibd_modldrv, 319 NULL 320 }; 321 322 /* 323 * Module Info passed to IBTL during IBT_ATTACH. 324 * NOTE: This data must be static (i.e. IBTL just keeps a pointer to this 325 * data). 326 */ 327 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 328 IBTI_V2, 329 IBT_NETWORK, 330 ibd_async_handler, 331 NULL, 332 "IPIB" 333 }; 334 335 /* 336 * Async operation types. 337 */ 338 #define ASYNC_GETAH 1 339 #define ASYNC_JOIN 2 340 #define ASYNC_LEAVE 3 341 #define ASYNC_PROMON 4 342 #define ASYNC_PROMOFF 5 343 #define ASYNC_REAP 6 344 #define ASYNC_POKE 7 345 #define ASYNC_TRAP 8 346 #define ASYNC_SCHED 9 347 #define ASYNC_LINK 10 348 #define ASYNC_EXIT 11 349 350 /* 351 * Async operation states 352 */ 353 #define NOTSTARTED 0 354 #define ONGOING 1 355 #define COMPLETED 2 356 #define ERRORED 3 357 #define ROUTERED 4 358 359 #define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF 360 361 #ifdef DEBUG 362 363 static int rxpack = 1, txpack = 1; 364 int debuglevel = 100; 365 static void 366 debug_print(int l, char *fmt, ...) 367 { 368 va_list ap; 369 370 if (l < debuglevel) 371 return; 372 va_start(ap, fmt); 373 vcmn_err(CE_CONT, fmt, ap); 374 va_end(ap); 375 } 376 #define INCRXPACK (rxpack++) 377 #define INCTXPACK (txpack++) 378 #define DPRINT debug_print 379 380 #else /* DEBUG */ 381 382 #define INCRXPACK 0 383 #define INCTXPACK 0 384 #define DPRINT 385 386 #endif /* DEBUG */ 387 388 /* 389 * Common routine to print warning messages; adds in hca guid, port number 390 * and pkey to be able to identify the IBA interface. 391 */ 392 static void 393 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 394 { 395 ib_guid_t hca_guid; 396 char ibd_print_buf[256]; 397 int len; 398 va_list ap; 399 400 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 401 0, "hca-guid", 0); 402 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 403 "%s%d: HCA GUID %016llx port %d PKEY %02x ", ibd_minfo.mi_idname, 404 state->id_macinfo->gldm_ppa, (u_longlong_t)hca_guid, 405 state->id_port, state->id_pkey); 406 va_start(ap, fmt); 407 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 408 fmt, ap); 409 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 410 va_end(ap); 411 } 412 413 /* warlock directives */ 414 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 415 ibd_state_t::id_ah_active)) 416 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free)) 417 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 418 ibd_state_t::id_req_list)) 419 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 420 ibd_state_t::id_acache_req_cv)) 421 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 422 ibd_state_t::id_multi_req)) 423 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 424 ibd_state_t::id_multi_addr)) 425 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 426 ibd_state_t::id_multi_op)) 427 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 428 ibd_state_t::id_multi_queued)) 429 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 430 ibd_state_t::id_mc_full)) 431 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 432 ibd_state_t::id_mc_non)) 433 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 434 ibd_state_t::id_link_state)) 435 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 436 ibd_state_s::id_tx_list)) 437 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 438 ibd_state_s::id_rx_list)) 439 440 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_multi_op)) 441 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error)) 442 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op)) 443 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs)) 444 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op)) 445 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short)) 446 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list)) 447 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list)) 448 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op)) 449 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid)) 450 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr)) 451 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce)) 452 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref)) 453 454 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s)) 455 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s)) 456 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s)) 457 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac)) 458 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh)) 459 460 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s)) 461 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req)) 462 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap)) 463 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate)) 464 465 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr)) 466 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr)) 467 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", gld_stats)) 468 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id)) 469 470 #ifdef DEBUG 471 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack)) 472 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack)) 473 #endif 474 475 int 476 _init() 477 { 478 int status; 479 480 /* 481 * Sanity check some parameter settings. Tx completion polling 482 * only makes sense with separate CQs for Tx and Rx. 483 */ 484 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 485 cmn_err(CE_NOTE, "!%s: %s", ibd_minfo.mi_idname, 486 "Setting ibd_txcomp_poll = 0 for combined CQ"); 487 ibd_txcomp_poll = 0; 488 } 489 490 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 491 if (status != 0) { 492 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 493 return (status); 494 } 495 496 status = mod_install(&ibd_modlinkage); 497 if (status != 0) { 498 DPRINT(10, "_init:failed in mod_install()"); 499 ddi_soft_state_fini(&ibd_list); 500 return (status); 501 } 502 503 return (0); 504 } 505 506 int 507 _info(struct modinfo *modinfop) 508 { 509 return (mod_info(&ibd_modlinkage, modinfop)); 510 } 511 512 int 513 _fini() 514 { 515 int status; 516 517 status = mod_remove(&ibd_modlinkage); 518 if (status != 0) 519 return (status); 520 521 ddi_soft_state_fini(&ibd_list); 522 return (0); 523 } 524 525 /* 526 * Convert the GID part of the mac address from network byte order 527 * to host order. 528 */ 529 static void 530 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 531 { 532 ib_sn_prefix_t nbopref; 533 ib_guid_t nboguid; 534 535 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 536 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 537 dgid->gid_prefix = b2h64(nbopref); 538 dgid->gid_guid = b2h64(nboguid); 539 } 540 541 /* 542 * Create the IPoIB address in network byte order from host order inputs. 543 */ 544 static void 545 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 546 ib_guid_t guid) 547 { 548 ib_sn_prefix_t nbopref; 549 ib_guid_t nboguid; 550 551 mac->ipoib_qpn = htonl(qpn); 552 nbopref = h2b64(prefix); 553 nboguid = h2b64(guid); 554 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 555 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 556 } 557 558 /* 559 * Send to the appropriate all-routers group when the IBA multicast group 560 * does not exist, based on whether the target group is v4 or v6. 561 */ 562 static boolean_t 563 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 564 ipoib_mac_t *rmac) 565 { 566 boolean_t retval = B_TRUE; 567 uint32_t adjscope = state->id_scope << 16; 568 uint32_t topword; 569 570 /* 571 * Copy the first 4 bytes in without assuming any alignment of 572 * input mac address; this will have IPoIB signature, flags and 573 * scope bits. 574 */ 575 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 576 topword = ntohl(topword); 577 578 /* 579 * Generate proper address for IPv4/v6, adding in the Pkey properly. 580 */ 581 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 582 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 583 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 584 ((uint32_t)(state->id_pkey << 16))), 585 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 586 else 587 /* 588 * Does not have proper bits in the mgid address. 589 */ 590 retval = B_FALSE; 591 592 return (retval); 593 } 594 595 /* 596 * Implementation of various (software) flavors of send and receive side 597 * checksumming. 598 */ 599 #define IBD_CKSUM_SEND(mp) { \ 600 uint32_t start, stuff, end, value, flags; \ 601 uint32_t cksum, sum; \ 602 uchar_t *dp, *buf; \ 603 uint16_t *up; \ 604 \ 605 if (ibd_csum_send == IBD_CSUM_NONE) \ 606 goto punt_send; \ 607 \ 608 /* \ 609 * Query IP whether Tx cksum needs to be done. \ 610 */ \ 611 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, \ 612 &value, &flags); \ 613 \ 614 if (flags == HCK_PARTIALCKSUM) { \ 615 dp = ((uchar_t *)mp->b_rptr + IPOIB_HDRSIZE); \ 616 up = (uint16_t *)(dp + stuff); \ 617 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 618 end = ((uchar_t *)mp->b_wptr - dp - start); \ 619 cksum = *up; \ 620 *up = 0; \ 621 /* \ 622 * Does NOT handle chained mblks/more than one \ 623 * SGL. Applicable only for a single SGL \ 624 * entry/mblk, where the stuff offset is \ 625 * within the range of buf. \ 626 */ \ 627 buf = (dp + start); \ 628 sum = IP_BCSUM_PARTIAL(buf, end, cksum); \ 629 } else { \ 630 sum = *up; \ 631 } \ 632 DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n", \ 633 start, stuff, end, sum, cksum); \ 634 sum = ~(sum); \ 635 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 636 } \ 637 punt_send: \ 638 ; \ 639 } 640 641 #define IBD_CKSUM_RECV(mp) { \ 642 uchar_t *dp, *buf; \ 643 uint32_t start, end, value, stuff, flags; \ 644 uint16_t *up, frag; \ 645 ipha_t *iphp; \ 646 ipoib_hdr_t *ipibh; \ 647 \ 648 if (ibd_csum_recv == IBD_CSUM_NONE) \ 649 goto punt_recv; \ 650 \ 651 ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\ 652 if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP) \ 653 goto punt_recv; \ 654 \ 655 dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE); \ 656 iphp = (ipha_t *)dp; \ 657 frag = ntohs(iphp->ipha_fragment_offset_and_flags); \ 658 if ((frag) & (~IPH_DF)) \ 659 goto punt_recv; \ 660 start = IPH_HDR_LENGTH(iphp); \ 661 if (iphp->ipha_protocol == IPPROTO_TCP) \ 662 stuff = start + 16; \ 663 else if (iphp->ipha_protocol == IPPROTO_UDP) \ 664 stuff = start + 6; \ 665 else \ 666 goto punt_recv; \ 667 \ 668 flags = HCK_PARTIALCKSUM; \ 669 end = ntohs(iphp->ipha_length); \ 670 up = (uint16_t *)(dp + stuff); \ 671 \ 672 if (ibd_csum_recv == IBD_CSUM_PARTIAL) { \ 673 buf = (dp + start); \ 674 value = IP_BCSUM_PARTIAL(buf, end - start, 0); \ 675 } else { \ 676 value = (*up); \ 677 } \ 678 if (hcksum_assoc(mp, NULL, NULL, start, stuff, end, \ 679 value, flags, 0) != 0) \ 680 DPRINT(10, "cksum_recv: value: %x\n", value); \ 681 punt_recv: \ 682 ; \ 683 } 684 685 #define IBD_CKSUM_MDT(mp, dlmdp, np, stp, stfp, ep, vp, fp) { \ 686 /* \ 687 * Query IP whether Tx cksum needs to be done. \ 688 */ \ 689 if (ibd_csum_send != IBD_CSUM_NONE) \ 690 hcksum_retrieve(mp, dlmdp, np, stp, stfp, ep, vp, fp); \ 691 } 692 693 #define IBD_CKSUM_MDT_PACKET(pinfo, st, stf, fl) { \ 694 if ((ibd_csum_send != IBD_CSUM_NONE) && \ 695 (fl == HCK_PARTIALCKSUM)) { \ 696 extern uint_t bcksum(uchar_t *, int, uint32_t); \ 697 uint16_t *up; \ 698 uint32_t sum; \ 699 uchar_t *hp = (pinfo)->hdr_rptr + IPOIB_HDRSIZE; \ 700 int k; \ 701 \ 702 up = (uint16_t *)(hp + stf); \ 703 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 704 sum = *up; \ 705 *up = 0; \ 706 sum = IP_BCSUM_PARTIAL(hp + st, \ 707 PDESC_HDRL(pinfo) - st - IPOIB_HDRSIZE, \ 708 sum); \ 709 for (k = 0; k < pinfo->pld_cnt; k++) \ 710 sum = IP_BCSUM_PARTIAL(pinfo->pld_ary[k].\ 711 pld_rptr, PDESC_PLDL(pinfo, k), \ 712 sum); \ 713 } else { \ 714 sum = *up; \ 715 } \ 716 sum = ~(sum); \ 717 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 718 } \ 719 } 720 721 /* 722 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 723 * front of optional src/tgt link layer address. Right now Solaris inserts 724 * padding by default at the end. The routine which is doing is nce_xmit() 725 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when 726 * the packet comes down from IP layer to the IBD driver, it is in the 727 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 728 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 729 * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 730 * 731 * The send routine at IBD driver changes this packet as follows: 732 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 733 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 734 * aligned. 735 * 736 * At the receiving side again ibd_process_rx takes the above packet and 737 * removes the two bytes of front padding and inserts it at the end. This 738 * is since the IP layer does not understand padding at the front. 739 */ 740 #define IBD_PAD_NSNA(ip6h, len, type) { \ 741 uchar_t *nd_lla_ptr; \ 742 icmp6_t *icmp6; \ 743 nd_opt_hdr_t *opt; \ 744 int i; \ 745 \ 746 icmp6 = (icmp6_t *)&ip6h[1]; \ 747 len -= sizeof (nd_neighbor_advert_t); \ 748 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 749 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 750 (len != 0)) { \ 751 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 752 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 753 ASSERT(opt != NULL); \ 754 nd_lla_ptr = (uchar_t *)&opt[1]; \ 755 if (type == 0) { \ 756 for (i = IPOIB_ADDRL; i > 0; i--) \ 757 *(nd_lla_ptr + i + 1) = \ 758 *(nd_lla_ptr + i - 1); \ 759 } else { \ 760 for (i = 0; i < IPOIB_ADDRL; i++) \ 761 *(nd_lla_ptr + i) = \ 762 *(nd_lla_ptr + i + 2); \ 763 } \ 764 *(nd_lla_ptr + i) = 0; \ 765 *(nd_lla_ptr + i + 1) = 0; \ 766 } \ 767 } 768 769 /* 770 * The service fifo code is copied verbatim from Cassini. This can be 771 * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu. 772 */ 773 774 typedef caddr_t fifo_obj_t, *p_fifo_obj_t; 775 776 typedef struct _srv_fifo_t { 777 kmutex_t fifo_lock; 778 kcondvar_t fifo_cv; 779 size_t size; 780 uint_t max_index; 781 uint_t rd_index; 782 uint_t wr_index; 783 uint_t objs_pending; 784 p_fifo_obj_t fifo_objs; 785 kthread_t *fifo_thread; 786 void (*drain_func)(caddr_t drain_func_arg); 787 caddr_t drain_func_arg; 788 boolean_t running; 789 callb_cpr_t cprinfo; 790 } srv_fifo_t, *p_srv_fifo_t; 791 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv)) 792 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo)) 793 794 static int 795 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size, 796 void (*drain_func)(), caddr_t drain_func_arg) 797 { 798 int status; 799 p_srv_fifo_t srv_fifo; 800 801 status = DDI_SUCCESS; 802 srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP); 803 srv_fifo->size = size; 804 srv_fifo->max_index = size - 1; 805 srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc( 806 size * sizeof (fifo_obj_t), KM_SLEEP); 807 mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL); 808 cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL); 809 srv_fifo->drain_func = drain_func; 810 srv_fifo->drain_func_arg = drain_func_arg; 811 srv_fifo->running = DDI_SUCCESS; 812 srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func, 813 (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60); 814 if (srv_fifo->fifo_thread == NULL) { 815 cv_destroy(&srv_fifo->fifo_cv); 816 mutex_destroy(&srv_fifo->fifo_lock); 817 kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t)); 818 kmem_free(srv_fifo, sizeof (srv_fifo_t)); 819 srv_fifo = NULL; 820 status = DDI_FAILURE; 821 } else 822 *handle = srv_fifo; 823 return (status); 824 } 825 826 static void 827 _ddi_srv_fifo_destroy(p_srv_fifo_t handle) 828 { 829 kt_did_t tid = handle->fifo_thread->t_did; 830 831 mutex_enter(&handle->fifo_lock); 832 handle->running = DDI_FAILURE; 833 cv_signal(&handle->fifo_cv); 834 while (handle->running == DDI_FAILURE) 835 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 836 mutex_exit(&handle->fifo_lock); 837 if (handle->objs_pending != 0) 838 cmn_err(CE_NOTE, "!Thread Exit with work undone."); 839 cv_destroy(&handle->fifo_cv); 840 mutex_destroy(&handle->fifo_lock); 841 kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t)); 842 kmem_free(handle, sizeof (srv_fifo_t)); 843 thread_join(tid); 844 } 845 846 static caddr_t 847 _ddi_srv_fifo_begin(p_srv_fifo_t handle) 848 { 849 #ifndef __lock_lint 850 CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock, 851 callb_generic_cpr, "srv_fifo"); 852 #endif /* ! _lock_lint */ 853 return (handle->drain_func_arg); 854 } 855 856 static void 857 _ddi_srv_fifo_end(p_srv_fifo_t handle) 858 { 859 callb_cpr_t cprinfo; 860 861 mutex_enter(&handle->fifo_lock); 862 cprinfo = handle->cprinfo; 863 handle->running = DDI_SUCCESS; 864 cv_signal(&handle->fifo_cv); 865 #ifndef __lock_lint 866 CALLB_CPR_EXIT(&cprinfo); 867 #endif /* ! _lock_lint */ 868 thread_exit(); 869 _NOTE(NOT_REACHED) 870 } 871 872 static int 873 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal) 874 { 875 int status; 876 877 mutex_enter(&handle->fifo_lock); 878 status = handle->running; 879 if (status == DDI_SUCCESS) { 880 if (ptr) { 881 if (handle->objs_pending < handle->size) { 882 if (handle->wr_index == handle->max_index) 883 handle->wr_index = 0; 884 else 885 handle->wr_index++; 886 handle->fifo_objs[handle->wr_index] = ptr; 887 handle->objs_pending++; 888 } else 889 status = DDI_FAILURE; 890 if (signal) 891 cv_signal(&handle->fifo_cv); 892 } else { 893 if (signal && (handle->objs_pending > 0)) 894 cv_signal(&handle->fifo_cv); 895 } 896 } 897 mutex_exit(&handle->fifo_lock); 898 return (status); 899 } 900 901 static int 902 _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr) 903 { 904 int status; 905 906 mutex_enter(&handle->fifo_lock); 907 status = handle->running; 908 if (status == DDI_SUCCESS) { 909 if (handle->objs_pending == 0) { 910 #ifndef __lock_lint 911 CALLB_CPR_SAFE_BEGIN(&handle->cprinfo); 912 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 913 CALLB_CPR_SAFE_END(&handle->cprinfo, 914 &handle->fifo_lock); 915 #endif /* !_lock_lint */ 916 *ptr = NULL; 917 } 918 if (handle->objs_pending > 0) { 919 if (handle->rd_index == handle->max_index) 920 handle->rd_index = 0; 921 else 922 handle->rd_index++; 923 *ptr = handle->fifo_objs[handle->rd_index]; 924 handle->objs_pending--; 925 } 926 status = handle->running; 927 } else { 928 if (handle->objs_pending) { 929 if (handle->rd_index == handle->max_index) 930 handle->rd_index = 0; 931 else 932 handle->rd_index++; 933 *ptr = handle->fifo_objs[handle->rd_index]; 934 handle->objs_pending--; 935 status = DDI_SUCCESS; 936 } else 937 status = DDI_FAILURE; 938 } 939 mutex_exit(&handle->fifo_lock); 940 return (status); 941 } 942 943 /* 944 * [un]map_rx_srv_fifos has been modified from its CE version. 945 */ 946 static void 947 drain_fifo(p_srv_fifo_t handle) 948 { 949 ibd_state_t *state; 950 mblk_t *mp; 951 952 state = (ibd_state_t *)_ddi_srv_fifo_begin(handle); 953 while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) { 954 /* 955 * Hand off to GLD. 956 */ 957 IBD_CKSUM_RECV(mp); 958 gld_recv(state->id_macinfo, mp); 959 } 960 _ddi_srv_fifo_end(handle); 961 } 962 963 static p_srv_fifo_t * 964 map_rx_srv_fifos(int *nfifos, void *private) 965 { 966 p_srv_fifo_t *srv_fifos; 967 int i, inst_taskqs, depth; 968 969 /* 970 * Default behavior on both sparc and amd cpus in terms of 971 * of worker thread is as follows: (N) indicates worker thread 972 * not enabled , (Y) indicates worker thread enabled. Default of 973 * ibd_srv_fifo is set to 0xffff. The default behavior can be 974 * overridden by setting ibd_srv_fifos to 0 or 1 as shown below. 975 * Worker thread model assigns lower priority to network 976 * processing making system more usable at higher network 977 * loads. 978 * ________________________________________________________ 979 * |Value of ibd_srv_fifo | 0 | 1 | 0xffff| 0 | 1 | 0xfffff | 980 * |----------------------|---|---|-------|---|---|---------| 981 * | | Sparc | x86 | 982 * |----------------------|---|---|-------|---|---|---------| 983 * | Single CPU |N | Y | N | N | Y | N | 984 * |----------------------|---|---|-------|---|---|---------| 985 * | Multi CPU |N | Y | Y | N | Y | Y | 986 * |______________________|___|___|_______|___|___|_________| 987 */ 988 if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) || 989 (ibd_srv_fifos == 0)) { 990 *nfifos = 0; 991 return ((p_srv_fifo_t *)1); 992 } 993 994 *nfifos = inst_taskqs; 995 srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t), 996 KM_SLEEP); 997 998 /* 999 * If the administrator has specified a fifo depth, use 1000 * that, else just decide what should be the depth. 1001 */ 1002 if (ibd_fifo_depth == 0) 1003 depth = (IBD_NUM_RWQE / inst_taskqs) + 16; 1004 else 1005 depth = ibd_fifo_depth; 1006 1007 for (i = 0; i < inst_taskqs; i++) 1008 if (_ddi_srv_fifo_create(&srv_fifos[i], 1009 depth, drain_fifo, 1010 (caddr_t)private) != DDI_SUCCESS) 1011 break; 1012 1013 if (i < inst_taskqs) 1014 goto map_rx_srv_fifos_fail1; 1015 1016 goto map_rx_srv_fifos_exit; 1017 1018 map_rx_srv_fifos_fail1: 1019 i--; 1020 for (; i >= 0; i--) { 1021 _ddi_srv_fifo_destroy(srv_fifos[i]); 1022 } 1023 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 1024 srv_fifos = NULL; 1025 1026 map_rx_srv_fifos_exit: 1027 return (srv_fifos); 1028 } 1029 1030 static void 1031 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos) 1032 { 1033 int i; 1034 1035 /* 1036 * If this interface was not using service fifos, quickly return. 1037 */ 1038 if (inst_taskqs == 0) 1039 return; 1040 1041 for (i = 0; i < inst_taskqs; i++) { 1042 _ddi_srv_fifo_destroy(srv_fifos[i]); 1043 } 1044 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 1045 } 1046 1047 /* 1048 * Choose between sending up the packet directly and handing off 1049 * to a service thread. 1050 */ 1051 static void 1052 ibd_send_up(ibd_state_t *state, mblk_t *mp) 1053 { 1054 p_srv_fifo_t *srvfifo; 1055 ipoib_hdr_t *lhdr; 1056 struct ip *ip_hdr; 1057 struct udphdr *tran_hdr; 1058 uchar_t prot; 1059 int tnum = -1, nfifos = state->id_nfifos; 1060 1061 /* 1062 * Quick path if the interface is not using service fifos. 1063 */ 1064 if (nfifos == 0) { 1065 hand_off: 1066 IBD_CKSUM_RECV(mp); 1067 gld_recv(state->id_macinfo, mp); 1068 return; 1069 } 1070 1071 /* 1072 * Is the packet big enough to look at the IPoIB header 1073 * and basic IP header to determine whether it is an 1074 * IPv4 packet? 1075 */ 1076 if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE + 1077 sizeof (struct ip))) { 1078 1079 lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE); 1080 1081 /* 1082 * Is the packet an IP(v4) packet? 1083 */ 1084 if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) { 1085 1086 ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE + 1087 IPOIB_HDRSIZE); 1088 prot = ip_hdr->ip_p; 1089 1090 /* 1091 * TCP or UDP packet? We use the UDP header, since 1092 * the first few words of both headers are laid out 1093 * similarly (src/dest ports). 1094 */ 1095 if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) { 1096 1097 tran_hdr = (struct udphdr *)( 1098 (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2)); 1099 1100 /* 1101 * Are we within limits of this packet? If 1102 * so, use the destination port to hash to 1103 * a service thread. 1104 */ 1105 if (mp->b_wptr >= ((uchar_t *)tran_hdr + 1106 sizeof (*tran_hdr))) 1107 tnum = (ntohs(tran_hdr->uh_dport) + 1108 ntohs(tran_hdr->uh_sport)) % 1109 nfifos; 1110 } 1111 } 1112 } 1113 1114 /* 1115 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the 1116 * packet up in interrupt context, reducing latency. 1117 */ 1118 if (tnum == -1) { 1119 goto hand_off; 1120 } 1121 1122 srvfifo = (p_srv_fifo_t *)state->id_fifos; 1123 if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp, 1124 B_TRUE) != DDI_SUCCESS) 1125 freemsg(mp); 1126 } 1127 1128 /* 1129 * Address handle entries maintained by the driver are kept in the 1130 * free and active lists. Each entry starts out in the free list; 1131 * it migrates to the active list when primed using ibt_get_paths() 1132 * and ibt_modify_ud_dest() for transmission to a specific destination. 1133 * In the active list, the entry has a reference count indicating the 1134 * number of ongoing/uncompleted transmits that reference it. The 1135 * entry is left in the active list even after the reference count 1136 * goes to 0, since successive transmits can find it there and do 1137 * not need to set up another entry (ie the path information is 1138 * cached using the active list). Entries on the active list are 1139 * also hashed using the destination link address as a key for faster 1140 * lookups during transmits. 1141 * 1142 * For any destination address (unicast or multicast, whatever the 1143 * join states), there will be at most one entry in the active list. 1144 * Entries with a 0 reference count on the active list can be reused 1145 * for a transmit to a new destination, if the free list is empty. 1146 * 1147 * The AH free list insertion/deletion is protected with the id_ac_mutex, 1148 * since the async thread and Tx callback handlers insert/delete. The 1149 * active list does not need a lock (all operations are done by the 1150 * async thread) but updates to the reference count are atomically 1151 * done (increments done by Tx path, decrements by the Tx callback handler). 1152 */ 1153 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 1154 list_insert_head(&state->id_ah_free, ce) 1155 #define IBD_ACACHE_GET_FREE(state) \ 1156 list_get_head(&state->id_ah_free) 1157 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 1158 int _ret_; \ 1159 list_insert_head(&state->id_ah_active, ce); \ 1160 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 1161 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1162 ASSERT(_ret_ == 0); \ 1163 } 1164 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 1165 list_remove(&state->id_ah_active, ce); \ 1166 (void) mod_hash_remove(state->id_ah_active_hash, \ 1167 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1168 } 1169 #define IBD_ACACHE_GET_ACTIVE(state) \ 1170 list_get_head(&state->id_ah_active) 1171 1172 /* 1173 * Membership states for different mcg's are tracked by two lists: 1174 * the "non" list is used for promiscuous mode, when all mcg traffic 1175 * needs to be inspected. This type of membership is never used for 1176 * transmission, so there can not be an AH in the active list 1177 * corresponding to a member in this list. This list does not need 1178 * any protection, since all operations are performed by the async 1179 * thread. 1180 * 1181 * "Full" and "SendOnly" membership is tracked using a single list, 1182 * the "full" list. This is because this single list can then be 1183 * searched during transmit to a multicast group (if an AH for the 1184 * mcg is not found in the active list), since at least one type 1185 * of membership must be present before initiating the transmit. 1186 * This list is also emptied during driver detach, since sendonly 1187 * membership acquired during transmit is dropped at detach time 1188 * alongwith ipv4 broadcast full membership. Insert/deletes to 1189 * this list are done only by the async thread, but it is also 1190 * searched in program context (see multicast disable case), thus 1191 * the id_mc_mutex protects the list. The driver detach path also 1192 * deconstructs the "full" list, but it ensures that the async 1193 * thread will not be accessing the list (by blocking out mcg 1194 * trap handling and making sure no more Tx reaping will happen). 1195 * 1196 * Currently, an IBA attach is done in the SendOnly case too, 1197 * although this is not required. 1198 */ 1199 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1200 list_insert_head(&state->id_mc_full, mce) 1201 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1202 list_insert_head(&state->id_mc_non, mce) 1203 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1204 ibd_mcache_find(mgid, &state->id_mc_full) 1205 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1206 ibd_mcache_find(mgid, &state->id_mc_non) 1207 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1208 list_remove(&state->id_mc_full, mce) 1209 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1210 list_remove(&state->id_mc_non, mce) 1211 1212 /* 1213 * AH and MCE active list manipulation: 1214 * 1215 * Multicast disable requests and MCG delete traps are two cases 1216 * where the active AH entry for the mcg (if any unreferenced one exists) 1217 * will be moved to the free list (to force the next Tx to the mcg to 1218 * join the MCG in SendOnly mode). Port up handling will also move AHs 1219 * from active to free list. 1220 * 1221 * In the case when some transmits are still pending on an entry 1222 * for an mcg, but a multicast disable has already been issued on the 1223 * mcg, there are some options to consider to preserve the join state 1224 * to ensure the emitted packet is properly routed on the IBA fabric. 1225 * For the AH, we can 1226 * 1. take out of active list at multicast disable time. 1227 * 2. take out of active list only when last pending Tx completes. 1228 * For the MCE, we can 1229 * 3. take out of active list at multicast disable time. 1230 * 4. take out of active list only when last pending Tx completes. 1231 * 5. move from active list to stale list at multicast disable time. 1232 * We choose to use 2,4. We use option 4 so that if a multicast enable 1233 * is tried before the pending Tx completes, the enable code finds the 1234 * mce in the active list and just has to make sure it will not be reaped 1235 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1236 * a stale list (#5) that would be checked in the enable code would need 1237 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1238 * after the multicast disable would try to put an AH in the active list, 1239 * and associate the mce it finds in the active list to this new AH, 1240 * whereas the mce is already associated with the previous AH (taken off 1241 * the active list), and will be removed once the pending Tx's complete 1242 * (unless a reference count on mce's is implemented). One implication of 1243 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1244 * grab new references on the AH, further delaying the leave. 1245 * 1246 * In the case of mcg delete (or create) trap when the port is sendonly 1247 * joined, the AH and MCE handling is different: the AH and MCE has to be 1248 * immediately taken off the active lists (forcing a join and path lookup 1249 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1250 * to an mcg as it is repeatedly created and deleted and goes thru 1251 * reincarnations). 1252 * 1253 * When a port is already sendonly joined, and a multicast enable is 1254 * attempted, the same mce structure is promoted; this ensures only a 1255 * single mce on the active list tracks the most powerful join state. 1256 * 1257 * In the case of port up event handling, the MCE for sendonly membership 1258 * is freed up, and the ACE is put into the free list as soon as possible 1259 * (depending on whether posted Tx's have completed). For fullmembership 1260 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1261 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1262 * done; else the mce is deconstructed (mc_fullreap case). 1263 * 1264 * MCG creation and deletion trap handling: 1265 * 1266 * These traps are unreliable (meaning sometimes the trap might never 1267 * be delivered to the subscribed nodes) and may arrive out-of-order 1268 * since they use UD transport. An alternative to relying on these 1269 * unreliable traps is to poll for mcg presence every so often, but 1270 * instead of doing that, we try to be as conservative as possible 1271 * while handling the traps, and hope that the traps do arrive at 1272 * the subscribed nodes soon. Note that if a node is fullmember 1273 * joined to an mcg, it can not possibly receive a mcg create/delete 1274 * trap for that mcg (by fullmember definition); if it does, it is 1275 * an old trap from a previous incarnation of the mcg. 1276 * 1277 * Whenever a trap is received, the driver cleans up its sendonly 1278 * membership to the group; we choose to do a sendonly leave even 1279 * on a creation trap to handle the case of a prior deletion of the mcg 1280 * having gone unnoticed. Consider an example scenario: 1281 * T1: MCG M is deleted, and fires off deletion trap D1. 1282 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1283 * T3: Node N tries to transmit to M, joining in sendonly mode. 1284 * T4: MCG M is deleted, and fires off deletion trap D2. 1285 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1286 * If the trap is D2, then a LEAVE is not required, since the mcg 1287 * is already deleted; but if it is D1, a LEAVE is required. A safe 1288 * approach is to always LEAVE, but the SM may be confused if it 1289 * receives a LEAVE without a prior JOIN. 1290 * 1291 * Management of the non-membership to an mcg is similar to the above, 1292 * except that if the interface is in promiscuous mode, it is required 1293 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1294 * if the re-join attempt fails (in which case a warning message needs 1295 * to be printed), it is not clear whether it failed due to the mcg not 1296 * existing, or some fabric/hca issues, due to the delayed nature of 1297 * trap delivery. Querying the SA to establish presence/absence of the 1298 * mcg is also racy at best. Thus, the driver just prints a warning 1299 * message when it can not rejoin after receiving a create trap, although 1300 * this might be (on rare occassions) a mis-warning if the create trap is 1301 * received after the mcg was deleted. 1302 */ 1303 1304 /* 1305 * Implementation of atomic "recycle" bits and reference count 1306 * on address handles. This utilizes the fact that max reference 1307 * count on any handle is limited by number of send wqes, thus 1308 * high bits in the ac_ref field can be used as the recycle bits, 1309 * and only the low bits hold the number of pending Tx requests. 1310 * This atomic AH reference counting allows the Tx completion 1311 * handler not to acquire the id_ac_mutex to process every completion, 1312 * thus reducing lock contention problems between completion and 1313 * the Tx path. 1314 */ 1315 #define CYCLEVAL 0x80000 1316 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1317 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1318 #define GET_REF(ace) ((ace)->ac_ref) 1319 #define GET_REF_CYCLE(ace) ( \ 1320 /* \ 1321 * Make sure "cycle" bit is set. \ 1322 */ \ 1323 ASSERT(CYCLE_SET(ace)), \ 1324 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1325 ) 1326 #define INC_REF(ace, num) { \ 1327 atomic_add_32(&(ace)->ac_ref, num); \ 1328 } 1329 #define SET_CYCLE_IF_REF(ace) ( \ 1330 CYCLE_SET(ace) ? B_TRUE : \ 1331 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1332 CYCLEVAL ? \ 1333 /* \ 1334 * Clear the "cycle" bit we just set; \ 1335 * ref count known to be 0 from above. \ 1336 */ \ 1337 CLEAR_REFCYCLE(ace), B_FALSE : \ 1338 /* \ 1339 * We set "cycle" bit; let caller know. \ 1340 */ \ 1341 B_TRUE \ 1342 ) 1343 #define DEC_REF_DO_CYCLE(ace) ( \ 1344 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1345 CYCLEVAL ? \ 1346 /* \ 1347 * Ref count known to be 0 from above. \ 1348 */ \ 1349 B_TRUE : \ 1350 B_FALSE \ 1351 ) 1352 1353 static void * 1354 list_get_head(list_t *list) 1355 { 1356 list_node_t *lhead = list_head(list); 1357 1358 if (lhead != NULL) 1359 list_remove(list, lhead); 1360 return (lhead); 1361 } 1362 1363 /* 1364 * This is always guaranteed to be able to queue the work. 1365 */ 1366 static void 1367 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1368 { 1369 /* Initialize request */ 1370 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1371 ptr->rq_op = op; 1372 1373 /* 1374 * Queue provided slot onto request pool. 1375 */ 1376 mutex_enter(&state->id_acache_req_lock); 1377 list_insert_tail(&state->id_req_list, ptr); 1378 1379 /* Go, fetch, async thread */ 1380 cv_signal(&state->id_acache_req_cv); 1381 mutex_exit(&state->id_acache_req_lock); 1382 } 1383 1384 /* 1385 * Main body of the per interface async thread. 1386 */ 1387 static void 1388 ibd_async_work(ibd_state_t *state) 1389 { 1390 ibd_req_t *ptr; 1391 callb_cpr_t cprinfo; 1392 1393 mutex_enter(&state->id_acache_req_lock); 1394 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1395 callb_generic_cpr, "ibd_async_work"); 1396 for (;;) { 1397 ptr = list_get_head(&state->id_req_list); 1398 if (ptr != NULL) { 1399 mutex_exit(&state->id_acache_req_lock); 1400 1401 /* 1402 * Once we have done the operation, there is no 1403 * guarantee the request slot is going to be valid, 1404 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP). 1405 */ 1406 1407 /* Perform the request */ 1408 switch (ptr->rq_op) { 1409 case ASYNC_GETAH: 1410 ibd_async_acache(state, &ptr->rq_mac); 1411 break; 1412 case ASYNC_POKE: 1413 /* 1414 * We need the gld_sched; that 1415 * happens below. No locks are 1416 * needed for the multi_op update. 1417 */ 1418 state->id_multi_op = NOTSTARTED; 1419 break; 1420 case ASYNC_REAP: 1421 ibd_async_reap_group(state, 1422 ptr->rq_ptr, ptr->rq_gid, 1423 IB_MC_JSTATE_FULL); 1424 break; 1425 case ASYNC_LEAVE: 1426 case ASYNC_JOIN: 1427 ibd_async_multicast(state, 1428 ptr->rq_gid, ptr->rq_op); 1429 break; 1430 case ASYNC_PROMON: 1431 ibd_async_setprom(state, B_TRUE); 1432 break; 1433 case ASYNC_PROMOFF: 1434 ibd_async_unsetprom(state, B_TRUE); 1435 break; 1436 case ASYNC_TRAP: 1437 ibd_async_trap(state, ptr); 1438 break; 1439 case ASYNC_SCHED: 1440 ibd_async_txsched(state); 1441 break; 1442 case ASYNC_LINK: 1443 ibd_async_link(state, ptr); 1444 break; 1445 case ASYNC_EXIT: 1446 mutex_enter(&state->id_acache_req_lock); 1447 #ifndef __lock_lint 1448 CALLB_CPR_EXIT(&cprinfo); 1449 #endif /* !__lock_lint */ 1450 _NOTE(NOT_REACHED) 1451 return; 1452 } 1453 1454 /* 1455 * Indicate blocked operation can now be retried. 1456 * Note gld_sched() gets the gld_maclock, 1457 * and the multicast/promiscuous paths 1458 * (ibd_set_multicast(), ibd_set_promiscuous()) 1459 * grab id_acache_req_lock in ibd_queue_work_slot() 1460 * with gld_maclock held, so we must not hold the 1461 * id_acache_req_lock while calling gld_sched to 1462 * prevent deadlock. 1463 */ 1464 gld_sched(state->id_macinfo); 1465 1466 mutex_enter(&state->id_acache_req_lock); 1467 } else { 1468 /* 1469 * Nothing to do: wait till new request arrives. 1470 */ 1471 #ifndef __lock_lint 1472 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1473 cv_wait(&state->id_acache_req_cv, 1474 &state->id_acache_req_lock); 1475 CALLB_CPR_SAFE_END(&cprinfo, 1476 &state->id_acache_req_lock); 1477 #endif /* !_lock_lint */ 1478 } 1479 } 1480 /*NOTREACHED*/ 1481 } 1482 1483 /* 1484 * Return when it is safe to queue requests to the async daemon; primarily 1485 * for subnet trap and async event handling. Disallow requests before the 1486 * daemon is created, and when interface deinitilization starts. 1487 */ 1488 static boolean_t 1489 ibd_async_safe(ibd_state_t *state) 1490 { 1491 mutex_enter(&state->id_trap_lock); 1492 if (state->id_trap_stop) { 1493 mutex_exit(&state->id_trap_lock); 1494 return (B_FALSE); 1495 } 1496 state->id_trap_inprog++; 1497 mutex_exit(&state->id_trap_lock); 1498 return (B_TRUE); 1499 } 1500 1501 /* 1502 * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet 1503 * trap or event handling to complete to kill the async thread and deconstruct 1504 * the mcg/ace list. 1505 */ 1506 static void 1507 ibd_async_done(ibd_state_t *state) 1508 { 1509 mutex_enter(&state->id_trap_lock); 1510 if (--state->id_trap_inprog == 0) 1511 cv_signal(&state->id_trap_cv); 1512 mutex_exit(&state->id_trap_lock); 1513 } 1514 1515 /* 1516 * Hash functions: 1517 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1518 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1519 * These operate on mac addresses input into ibd_send, but there is no 1520 * guarantee on the alignment of the ipoib_mac_t structure. 1521 */ 1522 /*ARGSUSED*/ 1523 static uint_t 1524 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1525 { 1526 ulong_t ptraddr = (ulong_t)key; 1527 uint_t hval; 1528 1529 /* 1530 * If the input address is 4 byte aligned, we can just dereference 1531 * it. This is most common, since IP will send in a 4 byte aligned 1532 * IP header, which implies the 24 byte IPoIB psuedo header will be 1533 * 4 byte aligned too. 1534 */ 1535 if ((ptraddr & 3) == 0) 1536 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1537 1538 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1539 return (hval); 1540 } 1541 1542 static int 1543 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1544 { 1545 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1546 return (0); 1547 else 1548 return (1); 1549 } 1550 1551 /* 1552 * Initialize all the per interface caches and lists; AH cache, 1553 * MCG list etc. 1554 */ 1555 static int 1556 ibd_acache_init(ibd_state_t *state) 1557 { 1558 ibd_ace_t *ce; 1559 int i; 1560 1561 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1562 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1563 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1564 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1565 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1566 offsetof(ibd_ace_t, ac_list)); 1567 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1568 offsetof(ibd_ace_t, ac_list)); 1569 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1570 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1571 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1572 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1573 offsetof(ibd_mce_t, mc_list)); 1574 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1575 offsetof(ibd_mce_t, mc_list)); 1576 list_create(&state->id_req_list, sizeof (ibd_req_t), 1577 offsetof(ibd_req_t, rq_list)); 1578 1579 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1580 IBD_NUM_AH, KM_SLEEP); 1581 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1582 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1583 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1584 ibd_acache_fini(state); 1585 return (DDI_FAILURE); 1586 } else { 1587 CLEAR_REFCYCLE(ce); 1588 ce->ac_mce = NULL; 1589 IBD_ACACHE_INSERT_FREE(state, ce); 1590 } 1591 } 1592 return (DDI_SUCCESS); 1593 } 1594 1595 static void 1596 ibd_acache_fini(ibd_state_t *state) 1597 { 1598 ibd_ace_t *ptr; 1599 1600 mutex_enter(&state->id_ac_mutex); 1601 1602 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1603 ASSERT(GET_REF(ptr) == 0); 1604 (void) ibt_free_ud_dest(ptr->ac_dest); 1605 } 1606 1607 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1608 ASSERT(GET_REF(ptr) == 0); 1609 (void) ibt_free_ud_dest(ptr->ac_dest); 1610 } 1611 1612 list_destroy(&state->id_ah_free); 1613 list_destroy(&state->id_ah_active); 1614 list_destroy(&state->id_mc_full); 1615 list_destroy(&state->id_mc_non); 1616 list_destroy(&state->id_req_list); 1617 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1618 mutex_exit(&state->id_ac_mutex); 1619 mutex_destroy(&state->id_ac_mutex); 1620 mutex_destroy(&state->id_mc_mutex); 1621 mutex_destroy(&state->id_acache_req_lock); 1622 cv_destroy(&state->id_acache_req_cv); 1623 } 1624 1625 /* 1626 * Search AH active hash list for a cached path to input destination. 1627 * If we are "just looking", hold == F. When we are in the Tx path, 1628 * we set hold == T to grab a reference on the AH so that it can not 1629 * be recycled to a new destination while the Tx request is posted. 1630 */ 1631 static ibd_ace_t * 1632 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1633 { 1634 ibd_ace_t *ptr; 1635 1636 ASSERT(mutex_owned(&state->id_ac_mutex)); 1637 1638 /* 1639 * Do hash search. 1640 */ 1641 if (mod_hash_find(state->id_ah_active_hash, 1642 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1643 if (hold) 1644 INC_REF(ptr, num); 1645 return (ptr); 1646 } 1647 return (NULL); 1648 } 1649 1650 /* 1651 * This is called by the tx side; if an initialized AH is found in 1652 * the active list, it is locked down and can be used; if no entry 1653 * is found, an async request is queued to do path resolution. 1654 */ 1655 static ibd_ace_t * 1656 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1657 { 1658 ibd_ace_t *ptr; 1659 1660 /* 1661 * Only attempt to print when we can; in the mdt pattr case, the 1662 * address is not aligned properly. 1663 */ 1664 if (((ulong_t)mac & 3) == 0) 1665 DPRINT(4, 1666 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1667 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1668 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1669 htonl(mac->ipoib_gidsuff[1])); 1670 1671 mutex_enter(&state->id_ac_mutex); 1672 1673 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1674 mutex_exit(&state->id_ac_mutex); 1675 return (ptr); 1676 } 1677 1678 /* 1679 * Implementation of a single outstanding async request; if 1680 * the operation is not started yet, queue a request and move 1681 * to ongoing state. Remember in id_ah_addr for which address 1682 * we are queueing the request, in case we need to flag an error; 1683 * Any further requests, for the same or different address, until 1684 * the operation completes, is sent back to GLD to be retried. 1685 * The async thread will update id_ah_op with an error indication 1686 * or will set it to indicate the next look up can start; either 1687 * way, it will gld_sched() so that all blocked requests come 1688 * back here. 1689 */ 1690 *err = GLD_NORESOURCES; 1691 if (state->id_ah_op == NOTSTARTED) { 1692 /* 1693 * We did not even find the entry; queue a request for it. 1694 */ 1695 bcopy(mac, &(state->id_ah_req.rq_mac), IPOIB_ADDRL); 1696 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_GETAH); 1697 state->id_ah_op = ONGOING; 1698 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1699 } else if ((state->id_ah_op != ONGOING) && 1700 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1701 /* 1702 * Check the status of the pathrecord lookup request 1703 * we had queued before. 1704 */ 1705 if (state->id_ah_op == ERRORED) { 1706 *err = GLD_FAILURE; 1707 state->id_ah_error++; 1708 } else { 1709 /* 1710 * ROUTERED case: We need to send to the 1711 * all-router MCG. If we can find the AH for 1712 * the mcg, the Tx will be attempted. If we 1713 * do not find the AH, we return NORESOURCES 1714 * to retry. 1715 */ 1716 ipoib_mac_t routermac; 1717 1718 (void) ibd_get_allroutergroup(state, mac, &routermac); 1719 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1720 numwqe); 1721 } 1722 state->id_ah_op = NOTSTARTED; 1723 } 1724 mutex_exit(&state->id_ac_mutex); 1725 1726 /* 1727 * The PathRecord lookup failed; retry any other blocked 1728 * Tx requests that might have come in between when we 1729 * initiated the path lookup and now that were sent back 1730 * to GLD to implement single outstanding lookup scheme. 1731 */ 1732 if (*err == GLD_FAILURE) 1733 gld_sched(state->id_macinfo); 1734 return (ptr); 1735 } 1736 1737 /* 1738 * Grab a not-currently-in-use AH/PathRecord from the active 1739 * list to recycle to a new destination. Only the async thread 1740 * executes this code. 1741 */ 1742 static ibd_ace_t * 1743 ibd_acache_get_unref(ibd_state_t *state) 1744 { 1745 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1746 1747 ASSERT(mutex_owned(&state->id_ac_mutex)); 1748 1749 /* 1750 * Do plain linear search. 1751 */ 1752 while (ptr != NULL) { 1753 /* 1754 * Note that it is possible that the "cycle" bit 1755 * is set on the AH w/o any reference count. The 1756 * mcg must have been deleted, and the tx cleanup 1757 * just decremented the reference count to 0, but 1758 * hasn't gotten around to grabbing the id_ac_mutex 1759 * to move the AH into the free list. 1760 */ 1761 if (GET_REF(ptr) == 0) { 1762 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1763 break; 1764 } 1765 ptr = list_next(&state->id_ah_active, ptr); 1766 } 1767 return (ptr); 1768 } 1769 1770 /* 1771 * Invoked to clean up AH from active list in case of multicast 1772 * disable and to handle sendonly memberships during mcg traps. 1773 * And for port up processing for multicast and unicast AHs. 1774 * Normally, the AH is taken off the active list, and put into 1775 * the free list to be recycled for a new destination. In case 1776 * Tx requests on the AH have not completed yet, the AH is marked 1777 * for reaping (which will put the AH on the free list) once the Tx's 1778 * complete; in this case, depending on the "force" input, we take 1779 * out the AH from the active list right now, or leave it also for 1780 * the reap operation. Returns TRUE if the AH is taken off the active 1781 * list (and either put into the free list right now, or arranged for 1782 * later), FALSE otherwise. 1783 */ 1784 static boolean_t 1785 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1786 { 1787 ibd_ace_t *acactive; 1788 boolean_t ret = B_TRUE; 1789 1790 ASSERT(mutex_owned(&state->id_ac_mutex)); 1791 1792 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1793 1794 /* 1795 * Note that the AH might already have the cycle bit set 1796 * on it; this might happen if sequences of multicast 1797 * enables and disables are coming so fast, that posted 1798 * Tx's to the mcg have not completed yet, and the cycle 1799 * bit is set successively by each multicast disable. 1800 */ 1801 if (SET_CYCLE_IF_REF(acactive)) { 1802 if (!force) { 1803 /* 1804 * The ace is kept on the active list, further 1805 * Tx's can still grab a reference on it; the 1806 * ace is reaped when all pending Tx's 1807 * referencing the AH complete. 1808 */ 1809 ret = B_FALSE; 1810 } else { 1811 /* 1812 * In the mcg trap case, we always pull the 1813 * AH from the active list. And also the port 1814 * up multi/unicast case. 1815 */ 1816 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1817 acactive->ac_mce = NULL; 1818 } 1819 } else { 1820 /* 1821 * Determined the ref count is 0, thus reclaim 1822 * immediately after pulling out the ace from 1823 * the active list. 1824 */ 1825 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1826 acactive->ac_mce = NULL; 1827 IBD_ACACHE_INSERT_FREE(state, acactive); 1828 } 1829 1830 } 1831 return (ret); 1832 } 1833 1834 /* 1835 * Helper function for async path record lookup. If we are trying to 1836 * Tx to a MCG, check our membership, possibly trying to join the 1837 * group if required. If that fails, try to send the packet to the 1838 * all router group (indicated by the redirect output), pointing 1839 * the input mac address to the router mcg address. 1840 */ 1841 static ibd_mce_t * 1842 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1843 { 1844 ib_gid_t mgid; 1845 ibd_mce_t *mce; 1846 ipoib_mac_t routermac; 1847 1848 *redirect = B_FALSE; 1849 ibd_n2h_gid(mac, &mgid); 1850 1851 /* 1852 * Check the FullMember+SendOnlyNonMember list. 1853 * Since we are the only one who manipulates the 1854 * id_mc_full list, no locks are needed. 1855 */ 1856 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1857 if (mce != NULL) { 1858 DPRINT(4, "ibd_async_mcache : already joined to group"); 1859 return (mce); 1860 } 1861 1862 /* 1863 * Not found; try to join(SendOnlyNonMember) and attach. 1864 */ 1865 DPRINT(4, "ibd_async_mcache : not joined to group"); 1866 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1867 NULL) { 1868 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1869 return (mce); 1870 } 1871 1872 /* 1873 * MCGroup not present; try to join the all-router group. If 1874 * any of the following steps succeed, we will be redirecting 1875 * to the all router group. 1876 */ 1877 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1878 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1879 return (NULL); 1880 *redirect = B_TRUE; 1881 ibd_n2h_gid(&routermac, &mgid); 1882 bcopy(&routermac, mac, IPOIB_ADDRL); 1883 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1884 mgid.gid_prefix, mgid.gid_guid); 1885 1886 /* 1887 * Are we already joined to the router group? 1888 */ 1889 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1890 DPRINT(4, "ibd_async_mcache : using already joined router" 1891 "group\n"); 1892 return (mce); 1893 } 1894 1895 /* 1896 * Can we join(SendOnlyNonMember) the router group? 1897 */ 1898 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1899 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1900 NULL) { 1901 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1902 return (mce); 1903 } 1904 1905 return (NULL); 1906 } 1907 1908 /* 1909 * Async path record lookup code. 1910 */ 1911 static void 1912 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1913 { 1914 ibd_ace_t *ce; 1915 ibd_mce_t *mce = NULL; 1916 ibt_path_attr_t path_attr; 1917 ibt_path_info_t path_info; 1918 ib_gid_t destgid; 1919 int ret = NOTSTARTED; 1920 1921 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1922 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1923 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1924 htonl(mac->ipoib_gidsuff[1])); 1925 1926 /* 1927 * Check whether we are trying to transmit to a MCG. 1928 * In that case, we need to make sure we are a member of 1929 * the MCG. 1930 */ 1931 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1932 boolean_t redirected; 1933 1934 /* 1935 * If we can not find or join the group or even 1936 * redirect, error out. 1937 */ 1938 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1939 NULL) { 1940 state->id_ah_op = ERRORED; 1941 return; 1942 } 1943 1944 /* 1945 * If we got redirected, we need to determine whether 1946 * the AH for the new mcg is in the cache already, and 1947 * not pull it in then; otherwise proceed to get the 1948 * path for the new mcg. There is no guarantee that 1949 * if the AH is currently in the cache, it will still be 1950 * there when we look in ibd_acache_lookup(), but that's 1951 * okay, we will come back here. 1952 */ 1953 if (redirected) { 1954 ret = ROUTERED; 1955 DPRINT(4, "ibd_async_acache : redirected to " 1956 "%08X:%08X:%08X:%08X:%08X", 1957 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1958 htonl(mac->ipoib_gidpref[1]), 1959 htonl(mac->ipoib_gidsuff[0]), 1960 htonl(mac->ipoib_gidsuff[1])); 1961 1962 mutex_enter(&state->id_ac_mutex); 1963 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1964 mutex_exit(&state->id_ac_mutex); 1965 DPRINT(4, "ibd_async_acache : router AH found"); 1966 state->id_ah_op = ROUTERED; 1967 return; 1968 } 1969 mutex_exit(&state->id_ac_mutex); 1970 } 1971 } 1972 1973 /* 1974 * Get an AH from the free list. 1975 */ 1976 mutex_enter(&state->id_ac_mutex); 1977 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1978 /* 1979 * No free ones; try to grab an unreferenced active 1980 * one. Maybe we need to make the active list LRU, 1981 * but that will create more work for Tx callbacks. 1982 * Is there a way of not having to pull out the 1983 * entry from the active list, but just indicate it 1984 * is being recycled? Yes, but that creates one more 1985 * check in the fast lookup path. 1986 */ 1987 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1988 /* 1989 * Pretty serious shortage now. 1990 */ 1991 state->id_ah_op = NOTSTARTED; 1992 mutex_exit(&state->id_ac_mutex); 1993 DPRINT(10, "ibd_async_acache : failed to find AH " 1994 "slot\n"); 1995 return; 1996 } 1997 /* 1998 * We could check whether ac_mce points to a SendOnly 1999 * member and drop that membership now. Or do it lazily 2000 * at detach time. 2001 */ 2002 ce->ac_mce = NULL; 2003 } 2004 mutex_exit(&state->id_ac_mutex); 2005 ASSERT(ce->ac_mce == NULL); 2006 2007 /* 2008 * Update the entry. 2009 */ 2010 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 2011 2012 bzero(&path_info, sizeof (path_info)); 2013 bzero(&path_attr, sizeof (ibt_path_attr_t)); 2014 path_attr.pa_sgid = state->id_sgid; 2015 path_attr.pa_num_dgids = 1; 2016 ibd_n2h_gid(&ce->ac_mac, &destgid); 2017 path_attr.pa_dgids = &destgid; 2018 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2019 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2020 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 2021 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 2022 goto error; 2023 } 2024 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 2025 ntohl(ce->ac_mac.ipoib_qpn), 2026 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 2027 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 2028 goto error; 2029 } 2030 2031 /* 2032 * mce is set whenever an AH is being associated with a 2033 * MCG; this will come in handy when we leave the MCG. The 2034 * lock protects Tx fastpath from scanning the active list. 2035 */ 2036 if (mce != NULL) 2037 ce->ac_mce = mce; 2038 mutex_enter(&state->id_ac_mutex); 2039 IBD_ACACHE_INSERT_ACTIVE(state, ce); 2040 state->id_ah_op = ret; 2041 mutex_exit(&state->id_ac_mutex); 2042 return; 2043 error: 2044 /* 2045 * We might want to drop SendOnly membership here if we 2046 * joined above. The lock protects Tx callbacks inserting 2047 * into the free list. 2048 */ 2049 mutex_enter(&state->id_ac_mutex); 2050 state->id_ah_op = ERRORED; 2051 IBD_ACACHE_INSERT_FREE(state, ce); 2052 mutex_exit(&state->id_ac_mutex); 2053 } 2054 2055 /* 2056 * While restoring port's presence on the subnet on a port up, it is possible 2057 * that the port goes down again. 2058 */ 2059 static void 2060 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 2061 { 2062 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 2063 int32_t lstate = (opcode == IBD_LINK_DOWN) ? GLD_LINKSTATE_DOWN : 2064 GLD_LINKSTATE_UP; 2065 ibd_mce_t *mce, *pmce; 2066 ibd_ace_t *ace, *pace; 2067 2068 DPRINT(10, "ibd_async_link(): %d", opcode); 2069 2070 /* 2071 * On a link up, revalidate the link speed/width. No point doing 2072 * this on a link down, since we will be unable to do SA operations, 2073 * defaulting to the lowest speed. Also notice that we update our 2074 * notion of speed before calling gld_linkstate(), which will do 2075 * neccesary higher level notifications for speed changes. 2076 */ 2077 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 2078 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2079 state->id_link_speed = ibd_get_portspeed(state); 2080 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2081 } 2082 2083 /* 2084 * Do all the work required to establish our presence on 2085 * the subnet. 2086 */ 2087 if (opcode == IBD_LINK_UP_ABSENT) { 2088 /* 2089 * If in promiscuous mode ... 2090 */ 2091 if (state->id_prom_op == COMPLETED) { 2092 /* 2093 * Drop all nonmembership. 2094 */ 2095 ibd_async_unsetprom(state, B_FALSE); 2096 2097 /* 2098 * Then, try to regain nonmembership to all mcg's. 2099 */ 2100 ibd_async_setprom(state, B_FALSE); 2101 2102 } 2103 2104 /* 2105 * Drop all sendonly membership (which also gets rid of the 2106 * AHs); try to reacquire all full membership. 2107 */ 2108 mce = list_head(&state->id_mc_full); 2109 while ((pmce = mce) != NULL) { 2110 mce = list_next(&state->id_mc_full, mce); 2111 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 2112 ibd_leave_group(state, 2113 pmce->mc_info.mc_adds_vect.av_dgid, 2114 IB_MC_JSTATE_SEND_ONLY_NON); 2115 else 2116 ibd_reacquire_group(state, pmce); 2117 } 2118 2119 /* 2120 * Recycle all active AHs to free list (and if there are 2121 * pending posts, make sure they will go into the free list 2122 * once the Tx's complete). Grab the lock to prevent 2123 * concurrent Tx's as well as Tx cleanups. 2124 */ 2125 mutex_enter(&state->id_ac_mutex); 2126 ace = list_head(&state->id_ah_active); 2127 while ((pace = ace) != NULL) { 2128 boolean_t cycled; 2129 2130 ace = list_next(&state->id_ah_active, ace); 2131 mce = pace->ac_mce; 2132 cycled = ibd_acache_recycle(state, &pace->ac_mac, 2133 B_TRUE); 2134 /* 2135 * If this is for an mcg, it must be for a fullmember, 2136 * since we got rid of send-only members above when 2137 * processing the mce list. 2138 */ 2139 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2140 IB_MC_JSTATE_FULL))); 2141 2142 /* 2143 * Check if the fullmember mce needs to be torn down, 2144 * ie whether the DLPI disable has already been done. 2145 * If so, do some of the work of tx_cleanup, namely 2146 * causing leave (which will fail), detach and 2147 * mce-freeing. tx_cleanup will put the AH into free 2148 * list. The reason to duplicate some of this 2149 * tx_cleanup work is because we want to delete the 2150 * AH right now instead of waiting for tx_cleanup, to 2151 * force subsequent Tx's to reacquire an AH. 2152 */ 2153 if ((mce != NULL) && (mce->mc_fullreap)) 2154 ibd_async_reap_group(state, mce, 2155 mce->mc_info.mc_adds_vect.av_dgid, 2156 mce->mc_jstate); 2157 } 2158 mutex_exit(&state->id_ac_mutex); 2159 } 2160 2161 /* 2162 * Macinfo is guaranteed to exist since driver does ibt_close_hca() 2163 * (which stops further events from being delivered) before 2164 * gld_mac_free(). At this point, it is guaranteed that gld_register 2165 * has already been done. 2166 */ 2167 mutex_enter(&state->id_link_mutex); 2168 state->id_link_state = lstate; 2169 gld_linkstate(state->id_macinfo, lstate); 2170 mutex_exit(&state->id_link_mutex); 2171 2172 /* 2173 * Free the request slot allocated by the event thread. 2174 */ 2175 kmem_free(req, sizeof (ibd_req_t)); 2176 2177 ibd_async_done(state); 2178 } 2179 2180 /* 2181 * When the link is notified up, we need to do a few things, based 2182 * on the port's current p_init_type_reply claiming a reinit has been 2183 * done or not. The reinit steps are: 2184 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2185 * the old Pkey and GID0 are correct. 2186 * 2. Register for mcg traps (already done by ibmf). 2187 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2188 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2189 * 4. Give up all sendonly memberships. 2190 * 5. Acquire all full memberships. 2191 * 6. In promiscuous mode, acquire all non memberships. 2192 * 7. Recycle all AHs to free list. 2193 */ 2194 static void 2195 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2196 { 2197 ibt_hca_portinfo_t *port_infop; 2198 ibt_status_t ibt_status; 2199 uint_t psize, port_infosz; 2200 ibd_link_op_t opcode; 2201 ibd_req_t *req; 2202 2203 /* 2204 * Do not send a request to the async daemon if it has not 2205 * yet been created or is being destroyed. If the async 2206 * daemon has not yet been created, we still need to track 2207 * last known state of the link. If this code races with the 2208 * detach path, then we are assured that the detach path has 2209 * not yet done the ibt_close_hca (which waits for all async 2210 * events to complete). If the code races with the attach path, 2211 * we need to validate the pkey/gid (in the link_up case) if 2212 * the initialization path has already set these up and created 2213 * IBTF resources based on the values. 2214 */ 2215 mutex_enter(&state->id_link_mutex); 2216 2217 /* 2218 * If the init code in ibd_drv_init hasn't yet set up the 2219 * pkey/gid, nothing to do; that code will set the link state. 2220 */ 2221 if (state->id_link_state == GLD_LINKSTATE_UNKNOWN) { 2222 mutex_exit(&state->id_link_mutex); 2223 return; 2224 } 2225 2226 if (code == IBT_EVENT_PORT_UP) { 2227 uint8_t itreply; 2228 boolean_t badup = B_FALSE; 2229 2230 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 2231 state->id_port, &port_infop, &psize, &port_infosz); 2232 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 2233 mutex_exit(&state->id_link_mutex); 2234 DPRINT(10, "ibd_link_up : failed in" 2235 " ibt_query_port()\n"); 2236 return; 2237 } 2238 2239 /* 2240 * If the link already went down by the time the handler gets 2241 * here, give up; we can not even validate pkey/gid since those 2242 * are not valid. 2243 */ 2244 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) 2245 badup = B_TRUE; 2246 2247 itreply = port_infop->p_init_type_reply; 2248 2249 /* 2250 * In InitTypeReply, check if NoLoadReply == 2251 * PreserveContentReply == 0, in which case, verify Pkey/GID0. 2252 */ 2253 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2254 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && 2255 (!badup)) { 2256 /* 2257 * Check that the subnet part of GID0 has not changed. 2258 */ 2259 if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, 2260 sizeof (ib_gid_t)) != 0) 2261 badup = B_TRUE; 2262 2263 /* 2264 * Check that Pkey/index mapping is still valid. 2265 */ 2266 if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || 2267 (port_infop->p_pkey_tbl[state->id_pkix] != 2268 state->id_pkey)) 2269 badup = B_TRUE; 2270 } 2271 2272 /* 2273 * In InitTypeReply, if PreservePresenceReply indicates the SM 2274 * has ensured that the port's presence in mcg, traps etc is 2275 * intact, nothing more to do. 2276 */ 2277 opcode = IBD_LINK_UP_ABSENT; 2278 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2279 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) 2280 opcode = IBD_LINK_UP; 2281 2282 if (badup) 2283 code = IBT_ERROR_PORT_DOWN; 2284 ibt_free_portinfo(port_infop, port_infosz); 2285 } 2286 2287 if (!ibd_async_safe(state)) { 2288 state->id_link_state = ((code == IBT_EVENT_PORT_UP) ? 2289 GLD_LINKSTATE_UP : GLD_LINKSTATE_DOWN); 2290 mutex_exit(&state->id_link_mutex); 2291 return; 2292 } 2293 mutex_exit(&state->id_link_mutex); 2294 2295 if (code == IBT_ERROR_PORT_DOWN) 2296 opcode = IBD_LINK_DOWN; 2297 2298 req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); 2299 req->rq_ptr = (void *)opcode; 2300 ibd_queue_work_slot(state, req, ASYNC_LINK); 2301 } 2302 2303 /* 2304 * For the port up/down events, IBTL guarantees there will not be concurrent 2305 * invocations of the handler. IBTL might coalesce link transition events, 2306 * and not invoke the handler for _each_ up/down transition, but it will 2307 * invoke the handler with last known state 2308 */ 2309 static void 2310 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2311 ibt_async_code_t code, ibt_async_event_t *event) 2312 { 2313 ibd_state_t *state = (ibd_state_t *)clnt_private; 2314 2315 switch (code) { 2316 case IBT_ERROR_CATASTROPHIC_CHAN: 2317 ibd_print_warn(state, "catastrophic channel error"); 2318 break; 2319 case IBT_ERROR_CQ: 2320 ibd_print_warn(state, "completion queue error"); 2321 break; 2322 case IBT_ERROR_PORT_DOWN: 2323 case IBT_EVENT_PORT_UP: 2324 /* 2325 * Events will be delivered to all instances that have 2326 * done ibt_open_hca() but not yet done ibt_close_hca(). 2327 * Only need to do work for our port; IBTF will deliver 2328 * events for other ports on the hca we have ibt_open_hca'ed 2329 * too. Note that ibd_drv_init() initializes id_port before 2330 * doing ibt_open_hca(). 2331 */ 2332 ASSERT(state->id_hca_hdl == hca_hdl); 2333 if (state->id_port != event->ev_port) 2334 break; 2335 2336 ibd_link_mod(state, code); 2337 break; 2338 2339 case IBT_HCA_ATTACH_EVENT: 2340 case IBT_HCA_DETACH_EVENT: 2341 /* 2342 * When a new card is plugged to the system, attach_event is 2343 * invoked. Additionally, a cfgadm needs to be run to make the 2344 * card known to the system, and an ifconfig needs to be run to 2345 * plumb up any ibd interfaces on the card. In the case of card 2346 * unplug, a cfgadm is run that will trigger any RCM scripts to 2347 * unplumb the ibd interfaces on the card; when the card is 2348 * actually unplugged, the detach_event is invoked; 2349 * additionally, if any ibd instances are still active on the 2350 * card (eg there were no associated RCM scripts), driver's 2351 * detach routine is invoked. 2352 */ 2353 break; 2354 default: 2355 break; 2356 } 2357 } 2358 2359 /* 2360 * Attach device to the IO framework. 2361 */ 2362 static int 2363 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2364 { 2365 ibd_state_t *state; 2366 int instance; 2367 2368 switch (cmd) { 2369 case DDI_ATTACH: 2370 break; 2371 case DDI_RESUME: 2372 /* This driver does not support resume */ 2373 default: 2374 return (DDI_FAILURE); 2375 } 2376 2377 /* 2378 * Allocate soft device data structure 2379 */ 2380 instance = ddi_get_instance(dip); 2381 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2382 return (DDI_FAILURE); 2383 state = ddi_get_soft_state(ibd_list, instance); 2384 2385 /* pre ibt_attach() soft state initialization */ 2386 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2387 DPRINT(10, "ibd_attach : failed in ibd_state_init()"); 2388 goto attach_fail_state_init; 2389 } 2390 2391 /* "attach" to IBTL */ 2392 if (ibt_attach(&ibd_clnt_modinfo, dip, state, 2393 &state->id_ibt_hdl) != IBT_SUCCESS) { 2394 DPRINT(10, "ibd_attach : failed in ibt_attach()"); 2395 goto attach_fail_ibt_attach; 2396 } 2397 2398 /* Finish initializing this driver */ 2399 if (ibd_drv_init(state) != DDI_SUCCESS) { 2400 DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); 2401 goto attach_fail_drv_init; 2402 } 2403 2404 /* 2405 * Register ourselves with the GLD interface 2406 * 2407 * gld_register will: 2408 * link us with the GLD module; 2409 * set our ddi_set_driver_private(9F) data to the macinfo ptr; 2410 * save the devinfo pointer in macinfo->gldm_devinfo; 2411 * create the minor device node. 2412 */ 2413 if (gld_register(dip, "ibd", state->id_macinfo) != DDI_SUCCESS) { 2414 DPRINT(10, "ibd_attach : failed in gld_register()"); 2415 goto attach_fail_gld_register; 2416 } 2417 2418 /* 2419 * Setup the handler we will use for regular DLPI stuff. Its important 2420 * to setup the recv handler after registering with gld. Setting it 2421 * before causes at times an incoming packet to be forwarded to gld 2422 * before the gld_register. This will result in gld dropping the packet 2423 * which is ignored by ibd_rcq_handler, thus failing to re-arm the 2424 * tavor events. This will cause tavor_isr on recv path to be not 2425 * invoked any further. 2426 */ 2427 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2428 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 2429 IBT_SUCCESS) { 2430 DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); 2431 goto attach_fail_gld_register; 2432 } 2433 2434 /* 2435 * Setup the subnet notices handler after we initialize the a/mcaches 2436 * and start the async thread, both of which are required for the 2437 * trap handler to function properly. Enable the trap handler to 2438 * queue requests to the async thread after the gld_register, because 2439 * the async daemon invokes gld_sched(), which must be done after 2440 * gld_register(). 2441 */ 2442 ibt_register_subnet_notices(state->id_ibt_hdl, 2443 ibd_snet_notices_handler, state); 2444 mutex_enter(&state->id_trap_lock); 2445 state->id_trap_stop = B_FALSE; 2446 mutex_exit(&state->id_trap_lock); 2447 2448 /* 2449 * Indicate link status to GLD and higher layers. By default, 2450 * we assume we are in up state (which must have been true at 2451 * least at the time the broadcast mcg's were probed); if there 2452 * were any up/down transitions till the time we come here, the 2453 * async handler will have updated last known state, which we 2454 * use to tell GLD. The async handler will not send any 2455 * notifications to GLD till we reach here in the initialization 2456 * sequence. 2457 */ 2458 mutex_enter(&state->id_link_mutex); 2459 gld_linkstate(state->id_macinfo, state->id_link_state); 2460 mutex_exit(&state->id_link_mutex); 2461 2462 return (DDI_SUCCESS); 2463 2464 /* Attach failure points, cleanup */ 2465 attach_fail_gld_register: 2466 ibd_drv_fini(state); 2467 2468 attach_fail_drv_init: 2469 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2470 ibd_print_warn(state, "failed to free IB resources"); 2471 2472 attach_fail_ibt_attach: 2473 ibd_state_fini(state); 2474 2475 attach_fail_state_init: 2476 ddi_soft_state_free(ibd_list, instance); 2477 2478 return (DDI_FAILURE); 2479 } 2480 2481 /* 2482 * Detach device from the IO framework. 2483 */ 2484 static int 2485 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2486 { 2487 ibd_state_t *state; 2488 int status; 2489 int instance; 2490 2491 switch (cmd) { 2492 case DDI_DETACH: 2493 break; 2494 case DDI_SUSPEND: 2495 default: 2496 return (DDI_FAILURE); 2497 } 2498 2499 instance = ddi_get_instance(dip); 2500 state = ddi_get_soft_state(ibd_list, instance); 2501 2502 /* 2503 * First, stop receive interrupts; this stops the 2504 * driver from handing up buffers to higher layers. 2505 * Wait for receive buffers to be returned; give up 2506 * after 5 seconds. 2507 */ 2508 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 2509 status = 50; 2510 while (state->id_rx_list.dl_bufs_outstanding > 0) { 2511 delay(drv_usectohz(100000)); 2512 if (--status == 0) { 2513 DPRINT(2, "ibd_detach : reclaiming failed"); 2514 goto failed; 2515 } 2516 } 2517 2518 if (gld_unregister(state->id_macinfo) != DDI_SUCCESS) { 2519 DPRINT(10, "ibd_detach : failed in gld_unregister()"); 2520 goto failed; 2521 } 2522 2523 ibd_drv_fini(state); 2524 2525 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2526 ibd_print_warn(state, "failed to free all IB resources at " 2527 "driver detach time"); 2528 2529 ibd_state_fini(state); 2530 ddi_soft_state_free(ibd_list, instance); 2531 return (DDI_SUCCESS); 2532 2533 failed: 2534 /* 2535 * Reap all the Tx/Rx completions that were posted since we 2536 * turned off the notification. Turn on notifications. There 2537 * is a race in that we do not reap completions that come in 2538 * after the poll and before notifications get turned on. That 2539 * is okay, the next rx/tx packet will trigger a completion 2540 * that will reap any missed completions. 2541 */ 2542 ibd_poll_compq(state, state->id_rcq_hdl); 2543 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2544 return (DDI_FAILURE); 2545 } 2546 2547 /* 2548 * Pre ibt_attach() driver initialization 2549 */ 2550 static int 2551 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2552 { 2553 gld_mac_info_t *macinfo; 2554 2555 if ((macinfo = gld_mac_alloc(dip)) == NULL) { 2556 DPRINT(10, "ibd_state_init : failed in gld_mac_alloc()"); 2557 return (DDI_FAILURE); 2558 } 2559 2560 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2561 state->id_link_state = GLD_LINKSTATE_UNKNOWN; 2562 2563 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2564 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2565 state->id_trap_stop = B_TRUE; 2566 state->id_trap_inprog = 0; 2567 2568 mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL); 2569 state->id_dip = dip; 2570 state->id_wcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); 2571 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); 2572 2573 state->id_sched_queued = B_FALSE; 2574 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2575 2576 state->id_tx_list.dl_head = NULL; 2577 state->id_tx_list.dl_tail = NULL; 2578 state->id_tx_list.dl_pending_sends = B_FALSE; 2579 state->id_tx_list.dl_cnt = 0; 2580 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2581 2582 state->id_rx_list.dl_head = NULL; 2583 state->id_rx_list.dl_tail = NULL; 2584 state->id_rx_list.dl_bufs_outstanding = 0; 2585 state->id_rx_list.dl_cnt = 0; 2586 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2587 2588 /* Link up various structs for later access */ 2589 macinfo->gldm_private = (caddr_t)state; 2590 state->id_macinfo = macinfo; 2591 2592 /* 2593 * Initialize pointers to device specific functions which will be 2594 * used by the generic layer. 2595 */ 2596 macinfo->gldm_reset = ibd_reset; 2597 macinfo->gldm_start = ibd_start; 2598 macinfo->gldm_stop = ibd_stop; 2599 macinfo->gldm_set_mac_addr = ibd_set_mac_addr; 2600 macinfo->gldm_set_multicast = ibd_set_multicast; 2601 macinfo->gldm_set_promiscuous = ibd_set_promiscuous; 2602 macinfo->gldm_get_stats = ibd_get_stats; 2603 macinfo->gldm_send = ibd_send; 2604 macinfo->gldm_intr = ibd_intr; 2605 macinfo->gldm_mdt_pre = ibd_mdt_pre; 2606 macinfo->gldm_mdt_send = ibd_mdt_txone; 2607 macinfo->gldm_mdt_post = ibd_mdt_post; 2608 macinfo->gldm_mdt_sgl = state->id_max_sqseg; 2609 macinfo->gldm_mdt_segs = IBD_MDTMAX_SEGS; 2610 2611 /* Initialize board characteristics needed by the generic layer. */ 2612 macinfo->gldm_ident = "InfiniBand DLPI Driver"; 2613 macinfo->gldm_type = DL_IB; 2614 macinfo->gldm_minpkt = 0; /* assumes we pad ourselves */ 2615 macinfo->gldm_addrlen = IPOIB_ADDRL; 2616 macinfo->gldm_saplen = -2; 2617 macinfo->gldm_capabilities = GLD_CAP_LINKSTATE; 2618 2619 /* Other required initialization */ 2620 macinfo->gldm_ppa = ddi_get_instance(dip); 2621 macinfo->gldm_devinfo = dip; 2622 2623 return (DDI_SUCCESS); 2624 } 2625 2626 /* 2627 * Post ibt_detach() driver deconstruction 2628 */ 2629 static void 2630 ibd_state_fini(ibd_state_t *state) 2631 { 2632 mutex_destroy(&state->id_tx_list.dl_mutex); 2633 mutex_destroy(&state->id_rx_list.dl_mutex); 2634 mutex_destroy(&state->id_sched_lock); 2635 mutex_destroy(&state->id_txcomp_lock); 2636 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); 2637 kmem_free(state->id_wcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); 2638 cv_destroy(&state->id_trap_cv); 2639 mutex_destroy(&state->id_trap_lock); 2640 mutex_destroy(&state->id_link_mutex); 2641 gld_mac_free(state->id_macinfo); 2642 } 2643 2644 /* 2645 * Fetch IBA parameters for the network device from IB nexus. 2646 */ 2647 static int 2648 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) 2649 { 2650 /* 2651 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. 2652 * Note that the default partition is also allowed. 2653 */ 2654 state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2655 0, "port-pkey", IB_PKEY_INVALID_LIMITED); 2656 if (state->id_pkey <= IB_PKEY_INVALID_FULL) { 2657 DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" 2658 "partition\n"); 2659 return (DDI_FAILURE); 2660 } 2661 2662 /* 2663 * ... the IBA port ... 2664 */ 2665 state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2666 0, "port-number", 0); 2667 if (state->id_port == 0) { 2668 DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); 2669 return (DDI_FAILURE); 2670 } 2671 2672 /* 2673 * ... and HCA GUID. 2674 */ 2675 *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 2676 0, "hca-guid", 0); 2677 if (*hca_guid == 0) { 2678 DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " 2679 "guid\n"); 2680 return (DDI_FAILURE); 2681 } 2682 2683 return (DDI_SUCCESS); 2684 } 2685 2686 /* 2687 * Fetch link speed from SA for snmp ifspeed reporting. 2688 */ 2689 static uint64_t 2690 ibd_get_portspeed(ibd_state_t *state) 2691 { 2692 int ret; 2693 uint64_t ifspeed; 2694 size_t length; 2695 ib_lid_t lid; 2696 sa_portinfo_record_t req, *resp = NULL; 2697 ibmf_saa_access_args_t args; 2698 ibmf_saa_handle_t saa_handle; 2699 2700 /* 2701 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2702 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2703 * 2000000000. Start with that as default. 2704 */ 2705 ifspeed = 2000000000; 2706 2707 /* Get port lid */ 2708 if (ibt_get_port_state(state->id_hca_hdl, state->id_port, NULL, 2709 &lid) != IBT_SUCCESS) 2710 goto earlydone; 2711 2712 if (ibmf_sa_session_open(state->id_sgid.gid_guid, 0, NULL, 2713 IBMF_VERSION, 0, &saa_handle) != IBMF_SUCCESS) 2714 goto earlydone; 2715 2716 /* Contact SA Access */ 2717 bzero(&req, sizeof (sa_portinfo_record_t)); 2718 req.EndportLID = lid; 2719 2720 args.sq_attr_id = SA_PORTINFORECORD_ATTRID; 2721 args.sq_access_type = IBMF_SAA_RETRIEVE; 2722 args.sq_component_mask = SA_PORTINFO_COMPMASK_PORTLID; 2723 args.sq_template = &req; 2724 args.sq_callback = NULL; 2725 args.sq_callback_arg = NULL; 2726 2727 ret = ibmf_sa_access(saa_handle, &args, 0, &length, (void **) &resp); 2728 if ((ret != IBMF_SUCCESS) || (length == 0) || (resp == NULL)) 2729 goto done; 2730 2731 /* 2732 * 4X/12X needs appropriate multipliers. With IBA 1.2 additions, 2733 * double and quad multipliers are also needed per LinkSpeedEnabled. 2734 * In case SA does not return an expected value, report the default 2735 * speed as 1X. 2736 */ 2737 ret = 1; 2738 switch (resp->PortInfo.LinkWidthActive) { 2739 case SM_LINK_WIDTH_ACTIVE_1X: 2740 ret = 1; 2741 break; 2742 case SM_LINK_WIDTH_ACTIVE_4X: 2743 ret = 4; 2744 break; 2745 case SM_LINK_WIDTH_ACTIVE_12X: 2746 ret = 12; 2747 break; 2748 } 2749 ifspeed *= ret; 2750 kmem_free(resp, length); 2751 2752 done: 2753 (void) ibmf_sa_session_close(&saa_handle, 0); 2754 2755 earlydone: 2756 return (ifspeed); 2757 } 2758 2759 /* 2760 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2761 * representing the input mcg mgid. 2762 */ 2763 static ibd_mce_t * 2764 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2765 { 2766 ibd_mce_t *ptr = list_head(mlist); 2767 2768 /* 2769 * Do plain linear search. 2770 */ 2771 while (ptr != NULL) { 2772 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2773 sizeof (ib_gid_t)) == 0) 2774 return (ptr); 2775 ptr = list_next(mlist, ptr); 2776 } 2777 return (NULL); 2778 } 2779 2780 /* 2781 * Execute IBA JOIN. 2782 */ 2783 static ibt_status_t 2784 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2785 { 2786 ibt_mcg_attr_t mcg_attr; 2787 2788 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2789 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2790 mcg_attr.mc_mgid = mgid; 2791 mcg_attr.mc_join_state = mce->mc_jstate; 2792 mcg_attr.mc_scope = state->id_scope; 2793 mcg_attr.mc_pkey = state->id_pkey; 2794 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2795 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2796 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2797 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2798 NULL, NULL)); 2799 } 2800 2801 /* 2802 * This code JOINs the port in the proper way (depending on the join 2803 * state) so that IBA fabric will forward mcg packets to/from the port. 2804 * It also attaches the QPN to the mcg so it can receive those mcg 2805 * packets. This code makes sure not to attach the mcg to the QP if 2806 * that has been previously done due to the mcg being joined with a 2807 * different join state, even though this is not required by SWG_0216, 2808 * refid 3610. 2809 */ 2810 static ibd_mce_t * 2811 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2812 { 2813 ibt_status_t ibt_status; 2814 ibd_mce_t *mce, *tmce, *omce = NULL; 2815 boolean_t do_attach = B_TRUE; 2816 2817 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2818 jstate, mgid.gid_prefix, mgid.gid_guid); 2819 2820 /* 2821 * For enable_multicast Full member joins, we need to do some 2822 * extra work. If there is already an mce on the list that 2823 * indicates full membership, that means the membership has 2824 * not yet been dropped (since the disable_multicast was issued) 2825 * because there are pending Tx's to the mcg; in that case, just 2826 * mark the mce not to be reaped when the Tx completion queues 2827 * an async reap operation. 2828 * 2829 * If there is already an mce on the list indicating sendonly 2830 * membership, try to promote to full membership. Be careful 2831 * not to deallocate the old mce, since there might be an AH 2832 * pointing to it; instead, update the old mce with new data 2833 * that tracks the full membership. 2834 */ 2835 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2836 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2837 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2838 ASSERT(omce->mc_fullreap); 2839 omce->mc_fullreap = B_FALSE; 2840 return (omce); 2841 } else { 2842 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2843 } 2844 } 2845 2846 /* 2847 * Allocate the ibd_mce_t to track this JOIN. 2848 */ 2849 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2850 mce->mc_fullreap = B_FALSE; 2851 mce->mc_jstate = jstate; 2852 2853 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2854 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2855 ibt_status); 2856 kmem_free(mce, sizeof (ibd_mce_t)); 2857 return (NULL); 2858 } 2859 2860 /* 2861 * Is an IBA attach required? Not if the interface is already joined 2862 * to the mcg in a different appropriate join state. 2863 */ 2864 if (jstate == IB_MC_JSTATE_NON) { 2865 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2866 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2867 do_attach = B_FALSE; 2868 } else if (jstate == IB_MC_JSTATE_FULL) { 2869 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2870 do_attach = B_FALSE; 2871 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2872 do_attach = B_FALSE; 2873 } 2874 2875 if (do_attach) { 2876 /* 2877 * Do the IBA attach. 2878 */ 2879 DPRINT(10, "ibd_join_group : ibt_attach_mcg \n"); 2880 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2881 &mce->mc_info)) != IBT_SUCCESS) { 2882 DPRINT(10, "ibd_join_group : failed qp attachment " 2883 "%d\n", ibt_status); 2884 /* 2885 * NOTE that we should probably preserve the join info 2886 * in the list and later try to leave again at detach 2887 * time. 2888 */ 2889 (void) ibt_leave_mcg(state->id_sgid, mgid, 2890 state->id_sgid, jstate); 2891 kmem_free(mce, sizeof (ibd_mce_t)); 2892 return (NULL); 2893 } 2894 } 2895 2896 /* 2897 * Insert the ibd_mce_t in the proper list. 2898 */ 2899 if (jstate == IB_MC_JSTATE_NON) { 2900 IBD_MCACHE_INSERT_NON(state, mce); 2901 } else { 2902 /* 2903 * Set up the mc_req fields used for reaping the 2904 * mcg in case of delayed tx completion (see 2905 * ibd_tx_cleanup()). Also done for sendonly join in 2906 * case we are promoted to fullmembership later and 2907 * keep using the same mce. 2908 */ 2909 mce->mc_req.rq_gid = mgid; 2910 mce->mc_req.rq_ptr = mce; 2911 /* 2912 * Check whether this is the case of trying to join 2913 * full member, and we were already joined send only. 2914 * We try to drop our SendOnly membership, but it is 2915 * possible that the mcg does not exist anymore (and 2916 * the subnet trap never reached us), so the leave 2917 * operation might fail. 2918 */ 2919 if (omce != NULL) { 2920 (void) ibt_leave_mcg(state->id_sgid, mgid, 2921 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2922 omce->mc_jstate = IB_MC_JSTATE_FULL; 2923 bcopy(&mce->mc_info, &omce->mc_info, 2924 sizeof (ibt_mcg_info_t)); 2925 kmem_free(mce, sizeof (ibd_mce_t)); 2926 return (omce); 2927 } 2928 mutex_enter(&state->id_mc_mutex); 2929 IBD_MCACHE_INSERT_FULL(state, mce); 2930 mutex_exit(&state->id_mc_mutex); 2931 } 2932 2933 return (mce); 2934 } 2935 2936 /* 2937 * Called during port up event handling to attempt to reacquire full 2938 * membership to an mcg. Stripped down version of ibd_join_group(). 2939 * Note that it is possible that the mcg might have gone away, and 2940 * gets recreated at this point. 2941 */ 2942 static void 2943 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2944 { 2945 ib_gid_t mgid; 2946 2947 /* 2948 * If the mc_fullreap flag is set, or this join fails, a subsequent 2949 * reap/leave is going to try to leave the group. We could prevent 2950 * that by adding a boolean flag into ibd_mce_t, if required. 2951 */ 2952 if (mce->mc_fullreap) 2953 return; 2954 2955 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2956 2957 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2958 mgid.gid_guid); 2959 2960 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2961 ibd_print_warn(state, "Failure on port up to rejoin " 2962 "multicast gid %016llx:%016llx", 2963 (u_longlong_t)mgid.gid_prefix, 2964 (u_longlong_t)mgid.gid_guid); 2965 } 2966 2967 /* 2968 * This code handles delayed Tx completion cleanups for mcg's to which 2969 * disable_multicast has been issued, regular mcg related cleanups during 2970 * disable_multicast, disable_promiscous and mcg traps, as well as 2971 * cleanups during driver detach time. Depending on the join state, 2972 * it deletes the mce from the appropriate list and issues the IBA 2973 * leave/detach; except in the disable_multicast case when the mce 2974 * is left on the active list for a subsequent Tx completion cleanup. 2975 */ 2976 static void 2977 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2978 uint8_t jstate) 2979 { 2980 ibd_mce_t *tmce; 2981 boolean_t do_detach = B_TRUE; 2982 2983 /* 2984 * Before detaching, we must check whether the other list 2985 * contains the mcg; if we detach blindly, the consumer 2986 * who set up the other list will also stop receiving 2987 * traffic. 2988 */ 2989 if (jstate == IB_MC_JSTATE_FULL) { 2990 /* 2991 * The following check is only relevant while coming 2992 * from the Tx completion path in the reap case. 2993 */ 2994 if (!mce->mc_fullreap) 2995 return; 2996 mutex_enter(&state->id_mc_mutex); 2997 IBD_MCACHE_PULLOUT_FULL(state, mce); 2998 mutex_exit(&state->id_mc_mutex); 2999 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3000 do_detach = B_FALSE; 3001 } else if (jstate == IB_MC_JSTATE_NON) { 3002 IBD_MCACHE_PULLOUT_NON(state, mce); 3003 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3004 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3005 do_detach = B_FALSE; 3006 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3007 mutex_enter(&state->id_mc_mutex); 3008 IBD_MCACHE_PULLOUT_FULL(state, mce); 3009 mutex_exit(&state->id_mc_mutex); 3010 do_detach = B_FALSE; 3011 } 3012 3013 /* 3014 * If we are reacting to a mcg trap and leaving our sendonly or 3015 * non membership, the mcg is possibly already gone, so attempting 3016 * to leave might fail. On the other hand, we must try to leave 3017 * anyway, since this might be a trap from long ago, and we could 3018 * have potentially sendonly joined to a recent incarnation of 3019 * the mcg and are about to loose track of this information. 3020 */ 3021 if (do_detach) { 3022 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3023 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3024 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3025 } 3026 3027 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3028 kmem_free(mce, sizeof (ibd_mce_t)); 3029 } 3030 3031 /* 3032 * Async code executed due to multicast and promiscuous disable requests 3033 * and mcg trap handling; also executed during driver detach. Mostly, a 3034 * leave and detach is done; except for the fullmember case when Tx 3035 * requests are pending, whence arrangements are made for subsequent 3036 * cleanup on Tx completion. 3037 */ 3038 static void 3039 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3040 { 3041 ipoib_mac_t mcmac; 3042 boolean_t recycled; 3043 ibd_mce_t *mce; 3044 3045 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3046 jstate, mgid.gid_prefix, mgid.gid_guid); 3047 3048 if (jstate == IB_MC_JSTATE_NON) { 3049 recycled = B_TRUE; 3050 mce = IBD_MCACHE_FIND_NON(state, mgid); 3051 /* 3052 * In case we are handling a mcg trap, we might not find 3053 * the mcg in the non list. 3054 */ 3055 if (mce == NULL) 3056 return; 3057 } else { 3058 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3059 3060 /* 3061 * In case we are handling a mcg trap, make sure the trap 3062 * is not arriving late; if we have an mce that indicates 3063 * that we are already a fullmember, that would be a clear 3064 * indication that the trap arrived late (ie, is for a 3065 * previous incarnation of the mcg). 3066 */ 3067 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3068 if ((mce == NULL) || (mce->mc_jstate == 3069 IB_MC_JSTATE_FULL)) 3070 return; 3071 ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3072 } else { 3073 ASSERT(jstate == IB_MC_JSTATE_FULL); 3074 ASSERT((mce != NULL) && (mce->mc_jstate == 3075 IB_MC_JSTATE_FULL)); 3076 mce->mc_fullreap = B_TRUE; 3077 } 3078 3079 /* 3080 * If no pending Tx's remain that reference the AH 3081 * for the mcg, recycle it from active to free list. 3082 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3083 * so the last completing Tx will cause an async reap 3084 * operation to be invoked, at which time we will drop our 3085 * membership to the mcg so that the pending Tx's complete 3086 * successfully. Refer to comments on "AH and MCE active 3087 * list manipulation" at top of this file. The lock protects 3088 * against Tx fast path and Tx cleanup code. 3089 */ 3090 mutex_enter(&state->id_ac_mutex); 3091 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3092 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3093 IB_MC_JSTATE_SEND_ONLY_NON)); 3094 mutex_exit(&state->id_ac_mutex); 3095 } 3096 3097 if (recycled) { 3098 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3099 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3100 ibd_async_reap_group(state, mce, mgid, jstate); 3101 } 3102 } 3103 3104 /* 3105 * Find the broadcast address as defined by IPoIB; implicitly 3106 * determines the IBA scope, mtu, tclass etc of the link the 3107 * interface is going to be a member of. 3108 */ 3109 static ibt_status_t 3110 ibd_find_bgroup(ibd_state_t *state) 3111 { 3112 ibt_mcg_attr_t mcg_attr; 3113 uint_t numg; 3114 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3115 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3116 IB_MC_SCOPE_GLOBAL }; 3117 int i, mcgmtu; 3118 boolean_t found = B_FALSE; 3119 3120 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3121 mcg_attr.mc_pkey = state->id_pkey; 3122 state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK; 3123 3124 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3125 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3126 3127 /* 3128 * Look for the IPoIB broadcast group. 3129 */ 3130 state->id_mgid.gid_prefix = 3131 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3132 ((uint64_t)state->id_scope << 48) | 3133 ((uint32_t)(state->id_pkey << 16))); 3134 mcg_attr.mc_mgid = state->id_mgid; 3135 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3136 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3137 found = B_TRUE; 3138 break; 3139 } 3140 3141 } 3142 3143 if (!found) { 3144 ibd_print_warn(state, "IPoIB broadcast group absent"); 3145 return (IBT_FAILURE); 3146 } 3147 3148 /* 3149 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3150 */ 3151 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3152 if (state->id_mtu < mcgmtu) { 3153 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3154 "greater than port's maximum MTU %d", mcgmtu, 3155 state->id_mtu); 3156 return (IBT_FAILURE); 3157 } 3158 state->id_mtu = mcgmtu; 3159 3160 return (IBT_SUCCESS); 3161 } 3162 3163 /* 3164 * Post ibt_attach() initialization. 3165 */ 3166 static int 3167 ibd_drv_init(ibd_state_t *state) 3168 { 3169 kthread_t *kht; 3170 ibt_ud_chan_alloc_args_t ud_alloc_attr; 3171 ibt_ud_chan_query_attr_t ud_chan_attr; 3172 ibt_hca_portinfo_t *port_infop; 3173 ibt_hca_attr_t hca_attrs; 3174 ibt_status_t ibt_status; 3175 ibt_cq_attr_t cq_attr; 3176 ib_guid_t hca_guid; 3177 uint32_t real_size; 3178 uint32_t *ptr; 3179 char pathname[OBP_MAXPATHLEN]; 3180 uint_t psize, port_infosz; 3181 3182 /* 3183 * Initialize id_port before ibt_open_hca because of 3184 * ordering requirements in port up/down handling. 3185 */ 3186 if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) 3187 return (DDI_FAILURE); 3188 3189 if (ibt_open_hca(state->id_ibt_hdl, hca_guid, 3190 &state->id_hca_hdl) != IBT_SUCCESS) { 3191 DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); 3192 return (DDI_FAILURE); 3193 } 3194 3195 mutex_enter(&state->id_link_mutex); 3196 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 3197 state->id_port, &port_infop, &psize, 3198 &port_infosz); 3199 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 3200 mutex_exit(&state->id_link_mutex); 3201 DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); 3202 (void) ibt_close_hca(state->id_hca_hdl); 3203 return (DDI_FAILURE); 3204 } 3205 3206 /* 3207 * If the link already went down by the time we get here, give up; 3208 * we can not even get the gid since that is not valid. We would 3209 * fail in ibd_find_bgroup() anyway. 3210 */ 3211 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 3212 mutex_exit(&state->id_link_mutex); 3213 ibt_free_portinfo(port_infop, port_infosz); 3214 (void) ibt_close_hca(state->id_hca_hdl); 3215 ibd_print_warn(state, "Port is not active"); 3216 return (DDI_FAILURE); 3217 } 3218 3219 /* 3220 * This verifies the Pkey ibnexus handed us is still valid. 3221 * This is also the point from which the pkey table for the 3222 * port must hold the exact pkey value at the exact index 3223 * across port up/downs. 3224 */ 3225 if (ibt_pkey2index(state->id_hca_hdl, state->id_port, 3226 state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { 3227 mutex_exit(&state->id_link_mutex); 3228 ibt_free_portinfo(port_infop, port_infosz); 3229 DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); 3230 (void) ibt_close_hca(state->id_hca_hdl); 3231 return (DDI_FAILURE); 3232 } 3233 3234 state->id_mtu = (128 << port_infop->p_mtu); 3235 state->id_sgid = *port_infop->p_sgid_tbl; 3236 state->id_link_state = GLD_LINKSTATE_UP; 3237 mutex_exit(&state->id_link_mutex); 3238 3239 ibt_free_portinfo(port_infop, port_infosz); 3240 3241 state->id_link_speed = ibd_get_portspeed(state); 3242 3243 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 3244 ASSERT(ibt_status == IBT_SUCCESS); 3245 3246 /* 3247 * We need to determine whether the HCA can support checksum 3248 * and indicate that to higher layers. 3249 */ 3250 if (ibd_csum_send > IBD_CSUM_NONE) 3251 state->id_macinfo->gldm_capabilities |= GLD_CAP_CKSUM_PARTIAL; 3252 3253 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 3254 DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); 3255 goto drv_init_fail_find_bgroup; 3256 } 3257 state->id_macinfo->gldm_maxpkt = state->id_mtu - IPOIB_HDRSIZE; 3258 3259 if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 3260 &state->id_pd_hdl) != IBT_SUCCESS) { 3261 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); 3262 goto drv_init_fail_alloc_pd; 3263 } 3264 3265 /* Initialize the parallel ARP cache and AHs */ 3266 if (ibd_acache_init(state) != DDI_SUCCESS) { 3267 DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); 3268 goto drv_init_fail_acache; 3269 } 3270 3271 /* 3272 * Check various tunable limits. 3273 */ 3274 if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) { 3275 ibd_print_warn(state, "Setting #sgl = %d instead of default %d", 3276 hca_attrs.hca_max_sgl, IBD_MAX_SQSEG); 3277 state->id_max_sqseg = hca_attrs.hca_max_sgl; 3278 } else { 3279 state->id_max_sqseg = IBD_MAX_SQSEG; 3280 } 3281 3282 /* 3283 * First, check #r/s wqes against max channel size. 3284 */ 3285 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) 3286 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 3287 else 3288 state->id_num_rwqe = IBD_NUM_RWQE; 3289 3290 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) 3291 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 3292 else 3293 state->id_num_swqe = IBD_NUM_SWQE; 3294 3295 /* 3296 * Allocate Rx/combined CQ: 3297 * Theoretically, there is no point in having more than #rwqe 3298 * plus #swqe cqe's, except that the CQ will be signalled for 3299 * overflow when the last wqe completes, if none of the previous 3300 * cqe's have been polled. Thus, we allocate just a few less wqe's 3301 * to make sure such overflow does not occur. 3302 */ 3303 cq_attr.cq_sched = NULL; 3304 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 3305 3306 if (ibd_separate_cqs == 1) { 3307 /* 3308 * Allocate Receive CQ. 3309 */ 3310 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 3311 cq_attr.cq_size = state->id_num_rwqe + 1; 3312 } else { 3313 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3314 state->id_num_rwqe = cq_attr.cq_size - 1; 3315 } 3316 3317 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3318 ibd_print_warn(state, "Computed #rwqe %d based on " 3319 "requested size and supportable CQ size is less " 3320 "than the required threshold %d", 3321 state->id_num_rwqe, IBD_RX_THRESHOLD); 3322 goto drv_init_fail_min_rwqes; 3323 } 3324 3325 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3326 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3327 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3328 goto drv_init_fail_alloc_rcq; 3329 } 3330 3331 /* 3332 * Allocate Send CQ. 3333 */ 3334 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 3335 cq_attr.cq_size = state->id_num_swqe + 1; 3336 } else { 3337 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3338 state->id_num_swqe = cq_attr.cq_size - 1; 3339 } 3340 3341 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3342 &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { 3343 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3344 goto drv_init_fail_alloc_scq; 3345 } 3346 } else { 3347 /* 3348 * Allocate combined Send/Receive CQ. 3349 */ 3350 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 3351 state->id_num_swqe + 1)) { 3352 cq_attr.cq_size = state->id_num_rwqe + 3353 state->id_num_swqe + 1; 3354 } else { 3355 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3356 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 3357 state->id_num_rwqe) / (state->id_num_rwqe + 3358 state->id_num_swqe); 3359 state->id_num_swqe = cq_attr.cq_size - 1 - 3360 state->id_num_rwqe; 3361 } 3362 3363 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3364 ibd_print_warn(state, "Computed #rwqe %d based on " 3365 "requested size and supportable CQ size is less " 3366 "than the required threshold %d", 3367 state->id_num_rwqe, IBD_RX_THRESHOLD); 3368 goto drv_init_fail_min_rwqes; 3369 } 3370 3371 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3372 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3373 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3374 goto drv_init_fail_alloc_rcq; 3375 } 3376 state->id_scq_hdl = state->id_rcq_hdl; 3377 } 3378 3379 /* 3380 * Print message in case we could not allocate as many wqe's 3381 * as was requested. Note that in the combined CQ case, we will 3382 * get the following message. 3383 */ 3384 if (state->id_num_rwqe != IBD_NUM_RWQE) 3385 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 3386 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 3387 if (state->id_num_swqe != IBD_NUM_SWQE) 3388 ibd_print_warn(state, "Setting #swqe = %d instead of default " 3389 "%d", state->id_num_swqe, IBD_NUM_SWQE); 3390 3391 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 3392 ud_alloc_attr.ud_hca_port_num = state->id_port; 3393 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 3394 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 3395 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 3396 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 3397 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 3398 ud_alloc_attr.ud_scq = state->id_scq_hdl; 3399 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 3400 ud_alloc_attr.ud_pd = state->id_pd_hdl; 3401 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 3402 ud_alloc_attr.ud_clone_chan = NULL; 3403 if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 3404 &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { 3405 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" 3406 "\n"); 3407 goto drv_init_fail_alloc_chan; 3408 } 3409 3410 if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != 3411 DDI_SUCCESS) { 3412 DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); 3413 goto drv_init_fail_query_chan; 3414 } 3415 state->id_qpnum = ud_chan_attr.ud_qpn; 3416 3417 /* Initialize the Transmit buffer list */ 3418 if (ibd_init_txlist(state) != DDI_SUCCESS) { 3419 DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); 3420 goto drv_init_fail_txlist_init; 3421 } 3422 3423 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 3424 /* Setup the handler we will use for regular DLPI stuff */ 3425 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 3426 if (ibt_enable_cq_notify(state->id_scq_hdl, 3427 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 3428 DPRINT(10, "ibd_drv_init : failed in" 3429 " ibt_enable_cq_notify()\n"); 3430 goto drv_init_fail_cq_notify; 3431 } 3432 } 3433 3434 /* Create the service fifos before we start receiving */ 3435 if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos, 3436 state)) == NULL) { 3437 DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n"); 3438 goto drv_init_fail_srv_fifo; 3439 } 3440 3441 /* Initialize the Receive buffer list */ 3442 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 3443 DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); 3444 goto drv_init_fail_rxlist_init; 3445 } 3446 3447 /* Join to IPoIB broadcast group as required by IPoIB */ 3448 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 3449 DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); 3450 goto drv_init_fail_join_group; 3451 } 3452 3453 /* Create the async thread */ 3454 if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 3455 TS_RUN, minclsyspri)) == NULL) { 3456 /* Do we have to specially leave the group? */ 3457 DPRINT(10, "ibd_drv_init : failed in thread_create\n"); 3458 goto drv_init_fail_thread_create; 3459 } 3460 state->id_async_thrid = kht->t_did; 3461 3462 /* 3463 * The local mac address is now known. Create the IPoIB 3464 * address. 3465 */ 3466 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 3467 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3468 state->id_macinfo->gldm_vendor_addr = (uchar_t *)&state->id_macaddr; 3469 3470 /* 3471 * Similarly, program in the broadcast mac address. 3472 */ 3473 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, 3474 state->id_mgid.gid_guid); 3475 state->id_macinfo->gldm_broadcast_addr = (uchar_t *)&state->id_bcaddr; 3476 3477 ptr = (uint32_t *)&state->id_macaddr; 3478 DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", 3479 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3480 ptr = (uint32_t *)&state->id_bcaddr; 3481 DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", 3482 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3483 DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", 3484 state->id_pkey, state->id_mgid.gid_prefix, 3485 state->id_mgid.gid_guid); 3486 DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", 3487 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3488 DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); 3489 DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); 3490 (void) ddi_pathname(state->id_dip, pathname); 3491 DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); 3492 3493 return (DDI_SUCCESS); 3494 3495 drv_init_fail_thread_create: 3496 ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL); 3497 3498 drv_init_fail_join_group: 3499 ibd_fini_rxlist(state); 3500 3501 drv_init_fail_rxlist_init: 3502 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3503 3504 drv_init_fail_srv_fifo: 3505 drv_init_fail_cq_notify: 3506 ibd_fini_txlist(state); 3507 3508 drv_init_fail_txlist_init: 3509 drv_init_fail_query_chan: 3510 if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) 3511 DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); 3512 3513 drv_init_fail_alloc_chan: 3514 if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != 3515 IBT_SUCCESS)) 3516 DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); 3517 3518 drv_init_fail_alloc_scq: 3519 if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) 3520 DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); 3521 3522 drv_init_fail_min_rwqes: 3523 drv_init_fail_alloc_rcq: 3524 ibd_acache_fini(state); 3525 drv_init_fail_acache: 3526 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3527 DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); 3528 3529 drv_init_fail_alloc_pd: 3530 ibt_free_mcg_info(state->id_mcinfo, 1); 3531 drv_init_fail_find_bgroup: 3532 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3533 DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); 3534 3535 return (DDI_FAILURE); 3536 } 3537 3538 /* 3539 * Allocate the statically allocated Tx buffer list. 3540 */ 3541 static int 3542 ibd_init_txlist(ibd_state_t *state) 3543 { 3544 ibd_swqe_t *swqe; 3545 int i; 3546 3547 for (i = 0; i < state->id_num_swqe; i++) { 3548 if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) { 3549 DPRINT(10, "ibd_init_txlist : failed in " 3550 "ibd_alloc_swqe()\n"); 3551 ibd_fini_txlist(state); 3552 return (DDI_FAILURE); 3553 } 3554 3555 /* add to list */ 3556 state->id_tx_list.dl_cnt++; 3557 if (state->id_tx_list.dl_head == NULL) { 3558 swqe->swqe_prev = NULL; 3559 swqe->swqe_next = NULL; 3560 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3561 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3562 } else { 3563 swqe->swqe_prev = state->id_tx_list.dl_tail; 3564 swqe->swqe_next = NULL; 3565 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3566 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3567 } 3568 } 3569 3570 return (DDI_SUCCESS); 3571 } 3572 3573 /* 3574 * Free the statically allocated Tx buffer list. 3575 */ 3576 static void 3577 ibd_fini_txlist(ibd_state_t *state) 3578 { 3579 ibd_swqe_t *node; 3580 3581 mutex_enter(&state->id_tx_list.dl_mutex); 3582 while (state->id_tx_list.dl_head != NULL) { 3583 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3584 state->id_tx_list.dl_head = node->swqe_next; 3585 state->id_tx_list.dl_cnt--; 3586 ASSERT(state->id_tx_list.dl_cnt >= 0); 3587 ibd_free_swqe(state, node); 3588 } 3589 mutex_exit(&state->id_tx_list.dl_mutex); 3590 } 3591 3592 /* 3593 * Allocate a single send wqe and register it so it is almost 3594 * ready to be posted to the hardware. 3595 */ 3596 static int 3597 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe) 3598 { 3599 ibt_mr_attr_t mem_attr; 3600 ibd_swqe_t *swqe; 3601 3602 swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP); 3603 *wqe = swqe; 3604 swqe->swqe_type = IBD_WQE_SEND; 3605 swqe->swqe_next = NULL; 3606 swqe->swqe_prev = NULL; 3607 swqe->swqe_im_mblk = NULL; 3608 swqe->w_mdtinfo = NULL; 3609 3610 /* alloc copy buffer, must be max size to handle multiple mblk case */ 3611 swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP); 3612 3613 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3614 mem_attr.mr_len = state->id_mtu; 3615 mem_attr.mr_as = NULL; 3616 mem_attr.mr_flags = IBT_MR_SLEEP; 3617 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3618 &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) != 3619 IBT_SUCCESS) { 3620 DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()"); 3621 kmem_free(swqe->swqe_copybuf.ic_bufaddr, 3622 state->id_mtu); 3623 kmem_free(swqe, sizeof (ibd_swqe_t)); 3624 return (DDI_FAILURE); 3625 } 3626 3627 swqe->swqe_copybuf.ic_sgl.ds_va = 3628 (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3629 swqe->swqe_copybuf.ic_sgl.ds_key = 3630 swqe->swqe_copybuf.ic_mr_desc.md_lkey; 3631 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3632 3633 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3634 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3635 swqe->w_swr.wr_trans = IBT_UD_SRV; 3636 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3637 3638 /* These are set in send */ 3639 swqe->w_swr.wr_nds = 0; 3640 swqe->w_swr.wr_sgl = NULL; 3641 3642 return (DDI_SUCCESS); 3643 } 3644 3645 /* 3646 * Free an allocated send wqe. 3647 */ 3648 static void 3649 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3650 { 3651 3652 if (ibt_deregister_mr(state->id_hca_hdl, 3653 swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3654 DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()"); 3655 return; 3656 } 3657 kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu); 3658 kmem_free(swqe, sizeof (ibd_swqe_t)); 3659 } 3660 3661 /* 3662 * Post a rwqe to the hardware and add it to the Rx list. The 3663 * "recycle" parameter indicates whether an old rwqe is being 3664 * recycled, or this is a new one. 3665 */ 3666 static int 3667 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3668 { 3669 if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) != 3670 IBT_SUCCESS) { 3671 DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()"); 3672 return (DDI_FAILURE); 3673 } 3674 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3675 3676 /* 3677 * Buffers being recycled are already in the list. 3678 */ 3679 if (recycle) 3680 return (DDI_SUCCESS); 3681 3682 mutex_enter(&state->id_rx_list.dl_mutex); 3683 if (state->id_rx_list.dl_head == NULL) { 3684 rwqe->rwqe_prev = NULL; 3685 rwqe->rwqe_next = NULL; 3686 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3687 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3688 } else { 3689 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3690 rwqe->rwqe_next = NULL; 3691 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3692 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3693 } 3694 mutex_exit(&state->id_rx_list.dl_mutex); 3695 3696 return (DDI_SUCCESS); 3697 } 3698 3699 /* 3700 * Allocate the statically allocated Rx buffer list. 3701 */ 3702 static int 3703 ibd_init_rxlist(ibd_state_t *state) 3704 { 3705 ibd_rwqe_t *rwqe; 3706 int i; 3707 3708 for (i = 0; i < state->id_num_rwqe; i++) { 3709 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3710 ibd_fini_rxlist(state); 3711 return (DDI_FAILURE); 3712 } 3713 3714 if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { 3715 ibd_free_rwqe(state, rwqe); 3716 ibd_fini_rxlist(state); 3717 return (DDI_FAILURE); 3718 } 3719 } 3720 3721 return (DDI_SUCCESS); 3722 } 3723 3724 /* 3725 * Free the statically allocated Rx buffer list. 3726 * 3727 */ 3728 static void 3729 ibd_fini_rxlist(ibd_state_t *state) 3730 { 3731 ibd_rwqe_t *node; 3732 3733 mutex_enter(&state->id_rx_list.dl_mutex); 3734 while (state->id_rx_list.dl_head != NULL) { 3735 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3736 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3737 state->id_rx_list.dl_cnt--; 3738 ASSERT(state->id_rx_list.dl_cnt >= 0); 3739 3740 ibd_free_rwqe(state, node); 3741 } 3742 mutex_exit(&state->id_rx_list.dl_mutex); 3743 } 3744 3745 /* 3746 * Allocate a single recv wqe and register it so it is almost 3747 * ready to be posted to the hardware. 3748 */ 3749 static int 3750 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3751 { 3752 ibt_mr_attr_t mem_attr; 3753 ibd_rwqe_t *rwqe; 3754 3755 if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3756 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3757 return (DDI_FAILURE); 3758 } 3759 *wqe = rwqe; 3760 rwqe->rwqe_type = IBD_WQE_RECV; 3761 rwqe->w_state = state; 3762 rwqe->rwqe_next = NULL; 3763 rwqe->rwqe_prev = NULL; 3764 rwqe->w_freeing_wqe = B_FALSE; 3765 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3766 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3767 3768 if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3769 IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) { 3770 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2"); 3771 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3772 return (DDI_FAILURE); 3773 } 3774 3775 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3776 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3777 NULL) { 3778 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3779 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3780 state->id_mtu + IPOIB_GRH_SIZE); 3781 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3782 return (DDI_FAILURE); 3783 } 3784 3785 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3786 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3787 mem_attr.mr_as = NULL; 3788 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3789 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3790 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3791 IBT_SUCCESS) { 3792 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3793 rwqe->w_freeing_wqe = B_TRUE; 3794 freemsg(rwqe->rwqe_im_mblk); 3795 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3796 state->id_mtu + IPOIB_GRH_SIZE); 3797 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3798 return (DDI_FAILURE); 3799 } 3800 3801 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3802 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3803 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3804 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3805 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3806 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3807 rwqe->w_rwr.wr_nds = 1; 3808 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3809 3810 return (DDI_SUCCESS); 3811 } 3812 3813 /* 3814 * Free an allocated recv wqe. 3815 */ 3816 static void 3817 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3818 { 3819 3820 if (ibt_deregister_mr(state->id_hca_hdl, 3821 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3822 DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()"); 3823 return; 3824 } 3825 3826 /* 3827 * Indicate to the callback function that this rwqe/mblk 3828 * should not be recycled. The freemsg() will invoke 3829 * ibd_freemsg_cb(). 3830 */ 3831 if (rwqe->rwqe_im_mblk != NULL) { 3832 rwqe->w_freeing_wqe = B_TRUE; 3833 freemsg(rwqe->rwqe_im_mblk); 3834 } 3835 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3836 state->id_mtu + IPOIB_GRH_SIZE); 3837 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3838 } 3839 3840 /* 3841 * Delete the rwqe being freed from the rx list. 3842 */ 3843 static void 3844 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3845 { 3846 mutex_enter(&state->id_rx_list.dl_mutex); 3847 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3848 state->id_rx_list.dl_head = rwqe->rwqe_next; 3849 else 3850 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3851 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3852 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3853 else 3854 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3855 mutex_exit(&state->id_rx_list.dl_mutex); 3856 } 3857 3858 /* 3859 * Pre ibt_detach() deconstruction. 3860 */ 3861 static void 3862 ibd_drv_fini(ibd_state_t *state) 3863 { 3864 ib_gid_t mgid; 3865 ibd_mce_t *mce; 3866 ibt_status_t status; 3867 uint8_t jstate; 3868 3869 /* 3870 * Desubscribe from trap notices; we will be tearing down 3871 * the mcg lists soon. Make sure the trap handler does nothing 3872 * even if it is invoked (ie till we invoke ibt_detach()). 3873 */ 3874 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 3875 mutex_enter(&state->id_trap_lock); 3876 state->id_trap_stop = B_TRUE; 3877 while (state->id_trap_inprog > 0) 3878 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 3879 mutex_exit(&state->id_trap_lock); 3880 3881 /* 3882 * Flushing the channel ensures that all pending WQE's 3883 * are marked with flush_error and handed to the CQ. It 3884 * does not guarantee the invocation of the CQ handler. 3885 * This call is guaranteed to return successfully for UD QPNs. 3886 */ 3887 status = ibt_flush_channel(state->id_chnl_hdl); 3888 ASSERT(status == IBT_SUCCESS); 3889 3890 /* 3891 * We possibly need a loop here to wait for all the Tx 3892 * callbacks to happen. The Tx handlers will retrieve 3893 * held resources like AH ac_ref count, registered memory 3894 * and possibly ASYNC_REAP requests. Rx interrupts were already 3895 * turned off (in ibd_detach()); turn off Tx interrupts and 3896 * poll. By the time the polling returns an empty indicator, 3897 * we are sure we have seen all pending Tx callbacks. Note 3898 * that after the ibt_set_cq_handler() returns, the old handler 3899 * is guaranteed not to be invoked anymore. 3900 */ 3901 if (ibd_separate_cqs == 1) 3902 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 3903 ibd_poll_compq(state, state->id_scq_hdl); 3904 3905 /* 3906 * No more async requests will be posted since the device has been 3907 * unregistered; completion handlers have been turned off, so Tx 3908 * handler will not cause any more ASYNC_REAP requests. Queue a 3909 * request for the async thread to exit, which will be serviced 3910 * after any pending ones. This can take a while, specially if the 3911 * SM is unreachable, since IBMF will slowly timeout each SM request 3912 * issued by the async thread. Reap the thread before continuing on, 3913 * we do not want it to be lingering in modunloaded code. 3914 */ 3915 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT); 3916 thread_join(state->id_async_thrid); 3917 3918 /* 3919 * We can not be in promiscuous mode anymore, upper layers 3920 * would have made a request to disable it (if ever set previously) 3921 * before the detach is allowed to progress to this point; and the 3922 * aysnc thread would have processed that request by now. Thus the 3923 * nonmember list is guaranteed empty at this point. 3924 */ 3925 ASSERT(state->id_prom_op != COMPLETED); 3926 3927 /* 3928 * Drop all residual full/non membership. This includes full 3929 * membership to the broadcast group, and any nonmembership 3930 * acquired during transmits. We do this after the Tx completion 3931 * handlers are done, since those might result in some late 3932 * leaves; this also eliminates a potential race with that 3933 * path wrt the mc full list insert/delete. Trap handling 3934 * has also been suppressed at this point. Thus, no locks 3935 * are required while traversing the mc full list. 3936 */ 3937 DPRINT(2, "ibd_drv_fini : clear full cache entries"); 3938 mce = list_head(&state->id_mc_full); 3939 while (mce != NULL) { 3940 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3941 jstate = mce->mc_jstate; 3942 mce = list_next(&state->id_mc_full, mce); 3943 ibd_leave_group(state, mgid, jstate); 3944 } 3945 3946 ibt_free_mcg_info(state->id_mcinfo, 1); 3947 3948 /* 3949 * Kill the channel now; guaranteed to return successfully 3950 * for UD QPNs. 3951 */ 3952 status = ibt_free_channel(state->id_chnl_hdl); 3953 ASSERT(status == IBT_SUCCESS); 3954 3955 /* 3956 * Kill the CQ; all completion handlers are guaranteed to 3957 * have terminated by the time this returns. Since we killed 3958 * the QPN above, we can not receive the IBT_CQ_BUSY error. 3959 */ 3960 status = ibt_free_cq(state->id_rcq_hdl); 3961 ASSERT(status == IBT_SUCCESS); 3962 3963 if (ibd_separate_cqs == 1) { 3964 status = ibt_free_cq(state->id_scq_hdl); 3965 ASSERT(status == IBT_SUCCESS); 3966 } 3967 3968 /* 3969 * We killed the receive interrupts, thus, we will not be 3970 * required to handle received packets anymore. Thus, kill 3971 * service threads since they are not going to be used anymore. 3972 */ 3973 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3974 3975 /* 3976 * Since these following will act on the Rx/Tx list, which 3977 * is also looked at by the Rx/Tx handlers, keep them around 3978 * till all handlers are guaranteed to have completed. 3979 */ 3980 ibd_fini_rxlist(state); 3981 ibd_fini_txlist(state); 3982 3983 /* 3984 * Clean up the active AH hash list. 3985 */ 3986 mod_hash_destroy_hash(state->id_ah_active_hash); 3987 3988 /* 3989 * Free parallel ARP cache and AHs; we are sure all of these 3990 * resources have been released by the Tx completion handler. 3991 */ 3992 ibd_acache_fini(state); 3993 3994 /* 3995 * We freed the QPN, all the MRs and AHs. This step should not 3996 * fail; print a warning message if it does fail, due to a bug 3997 * in the driver. 3998 */ 3999 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 4000 ibd_print_warn(state, "failed to free protection domain"); 4001 4002 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 4003 ibd_print_warn(state, "failed to close HCA device"); 4004 } 4005 4006 /* 4007 * IBA Rx/Tx completion queue handler. Guaranteed to be single 4008 * threaded and nonreentrant for this CQ. When using combined CQ, 4009 * this handles Tx and Rx completions. With separate CQs, this handles 4010 * only Rx completions. 4011 */ 4012 /* ARGSUSED */ 4013 static void 4014 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4015 { 4016 ibd_state_t *state = (ibd_state_t *)arg; 4017 4018 atomic_add_64(&state->id_num_intrs, 1); 4019 (void) gld_intr(state->id_macinfo); 4020 } 4021 4022 /* 4023 * Separate CQ handler for Tx completions, when the Tx CQ is in 4024 * interrupt driven mode. 4025 */ 4026 /* ARGSUSED */ 4027 static void 4028 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4029 { 4030 ibd_state_t *state = (ibd_state_t *)arg; 4031 4032 atomic_add_64(&state->id_num_intrs, 1); 4033 4034 /* 4035 * Poll for completed entries; the CQ will not interrupt any 4036 * more for completed packets. 4037 */ 4038 ibd_poll_compq(state, state->id_scq_hdl); 4039 4040 /* 4041 * Now enable CQ notifications; all completions originating now 4042 * will cause new interrupts. 4043 */ 4044 if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != 4045 IBT_SUCCESS) { 4046 /* 4047 * We do not expect a failure here. 4048 */ 4049 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 4050 } 4051 4052 /* 4053 * Repoll to catch all packets that might have completed after 4054 * we finished the first poll loop and before interrupts got 4055 * armed. 4056 */ 4057 ibd_poll_compq(state, state->id_scq_hdl); 4058 } 4059 4060 /* 4061 * Multicast group create/delete trap handler. These will be delivered 4062 * on a kernel thread (handling can thus block) and can be invoked 4063 * concurrently. The handler can be invoked anytime after it is 4064 * registered and before ibt_detach(). 4065 */ 4066 /* ARGSUSED */ 4067 static void 4068 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4069 ibt_subnet_event_t *event) 4070 { 4071 ibd_state_t *state = (ibd_state_t *)arg; 4072 ibd_req_t *req; 4073 4074 /* 4075 * The trap handler will get invoked once for every event for 4076 * evert port. The input "gid" is the GID0 of the port the 4077 * trap came in on; we just need to act on traps that came 4078 * to our port, meaning the port on which the ipoib interface 4079 * resides. Since ipoib uses GID0 of the port, we just match 4080 * the gids to check whether we need to handle the trap. 4081 */ 4082 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4083 return; 4084 4085 DPRINT(10, "ibd_notices_handler : %d\n", code); 4086 4087 switch (code) { 4088 case IBT_SM_EVENT_UNAVAILABLE: 4089 /* 4090 * If we are in promiscuous mode or have 4091 * sendnonmembers, we need to print a warning 4092 * message right now. Else, just store the 4093 * information, print when we enter promiscuous 4094 * mode or attempt nonmember send. We might 4095 * also want to stop caching sendnonmember. 4096 */ 4097 ibd_print_warn(state, "IBA multicast support " 4098 "degraded due to unavailability of multicast " 4099 "traps"); 4100 break; 4101 case IBT_SM_EVENT_AVAILABLE: 4102 /* 4103 * If we printed a warning message above or 4104 * while trying to nonmember send or get into 4105 * promiscuous mode, print an okay message. 4106 */ 4107 ibd_print_warn(state, "IBA multicast support " 4108 "restored due to availability of multicast " 4109 "traps"); 4110 break; 4111 case IBT_SM_EVENT_MCG_CREATED: 4112 case IBT_SM_EVENT_MCG_DELETED: 4113 /* 4114 * Common processing of creation/deletion traps. 4115 * First check if the instance is being 4116 * [de]initialized; back off then, without doing 4117 * anything more, since we are not sure if the 4118 * async thread is around, or whether we might 4119 * be racing with the detach code in ibd_drv_fini() 4120 * that scans the mcg list. 4121 */ 4122 if (!ibd_async_safe(state)) 4123 return; 4124 4125 req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); 4126 req->rq_gid = event->sm_notice_gid; 4127 req->rq_ptr = (void *)code; 4128 ibd_queue_work_slot(state, req, ASYNC_TRAP); 4129 break; 4130 } 4131 } 4132 4133 static void 4134 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4135 { 4136 ib_gid_t mgid = req->rq_gid; 4137 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4138 4139 DPRINT(10, "ibd_async_trap : %d\n", code); 4140 4141 /* 4142 * Atomically search the nonmember and sendonlymember lists and 4143 * delete. 4144 */ 4145 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4146 4147 if (state->id_prom_op == COMPLETED) { 4148 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4149 4150 /* 4151 * If in promiscuous mode, try to join/attach to the new 4152 * mcg. Given the unreliable out-of-order mode of trap 4153 * delivery, we can never be sure whether it is a problem 4154 * if the join fails. Thus, we warn the admin of a failure 4155 * if this was a creation trap. Note that the trap might 4156 * actually be reporting a long past event, and the mcg 4157 * might already have been deleted, thus we might be warning 4158 * in vain. 4159 */ 4160 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4161 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4162 ibd_print_warn(state, "IBA promiscuous mode missed " 4163 "new multicast gid %016llx:%016llx", 4164 (u_longlong_t)mgid.gid_prefix, 4165 (u_longlong_t)mgid.gid_guid); 4166 } 4167 4168 /* 4169 * Free the request slot allocated by the subnet event thread. 4170 */ 4171 kmem_free(req, sizeof (ibd_req_t)); 4172 4173 ibd_async_done(state); 4174 } 4175 4176 /* 4177 * GLD entry point to reset hardware. 4178 */ 4179 /* ARGSUSED */ 4180 static int 4181 ibd_reset(gld_mac_info_t *macinfo) 4182 { 4183 /* 4184 * This will be invoked from Style 1 open() and Style 2 4185 * attach() routines, ie just before the interface starts 4186 * getting used. 4187 */ 4188 return (GLD_SUCCESS); 4189 } 4190 4191 /* 4192 * GLD entry point to start hardware. 4193 */ 4194 /* ARGSUSED */ 4195 static int 4196 ibd_start(gld_mac_info_t *macinfo) 4197 { 4198 return (GLD_SUCCESS); 4199 } 4200 4201 /* 4202 * GLD entry point to stop hardware from receiving packets. 4203 */ 4204 /* ARGSUSED */ 4205 static int 4206 ibd_stop(gld_mac_info_t *macinfo) 4207 { 4208 #ifdef RUN_PERFORMANCE 4209 ibd_perf((ibd_state_t *)macinfo->gldm_private); 4210 #endif 4211 return (GLD_SUCCESS); 4212 } 4213 4214 /* 4215 * GLD entry point to modify device's mac address. We do not 4216 * allow address modifications. 4217 */ 4218 static int 4219 ibd_set_mac_addr(gld_mac_info_t *macinfo, unsigned char *macaddr) 4220 { 4221 ibd_state_t *state; 4222 4223 state = (ibd_state_t *)macinfo->gldm_private; 4224 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4225 return (GLD_SUCCESS); 4226 else 4227 return (GLD_FAILURE); 4228 } 4229 4230 /* 4231 * The blocking part of the IBA join/leave operations are done out 4232 * of here on the async thread. 4233 */ 4234 static void 4235 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4236 { 4237 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4238 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4239 4240 if (op == ASYNC_JOIN) { 4241 int ret = ERRORED; 4242 4243 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) != NULL) 4244 ret = COMPLETED; 4245 4246 state->id_multi_op = ret; 4247 } else { 4248 /* 4249 * Here, we must search for the proper mcg_info and 4250 * use that to leave the group. 4251 */ 4252 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4253 } 4254 } 4255 4256 /* 4257 * GLD entry point for multicast enable/disable requests. 4258 * Invoked by GLD only on the first multicast enable for a specific 4259 * address (GLD is free to retry ocassionally if we return RETRY), 4260 * and on last disable of the same address. Just queue the operation 4261 * to the async thread. 4262 */ 4263 static int 4264 ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) 4265 { 4266 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4267 ipoib_mac_t *mcast; 4268 ib_gid_t mgid; 4269 ib_qpn_t mcqpn; 4270 int ret; 4271 4272 /* 4273 * The incoming multicast address might not be aligned properly 4274 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4275 * it to look like one though, to get the offsets of the mc gid, 4276 * since we know we are not going to dereference any values with 4277 * the ipoib_mac_t pointer. 4278 */ 4279 mcast = (ipoib_mac_t *)mcmac; 4280 4281 /* 4282 * Check validity of MCG address. We could additionally check 4283 * that a enable/disable is not being issued on the "broadcast" 4284 * mcg, but since this operation is only invokable by priviledged 4285 * programs anyway, we allow the flexibility to those dlpi apps. 4286 * Note that we do not validate the "scope" of the IBA mcg. 4287 */ 4288 bcopy(&mcast->ipoib_qpn, &mcqpn, sizeof (ib_qpn_t)); 4289 if (mcqpn != htonl(IB_MC_QPN)) 4290 return (GLD_FAILURE); 4291 4292 /* 4293 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4294 * nothing (ie we stay JOINed to the broadcast group done in 4295 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically 4296 * requires to be joined to broadcast groups at all times. 4297 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4298 * depends on this. 4299 */ 4300 if (bcmp(mcast, state->id_macinfo->gldm_broadcast_addr, 4301 IPOIB_ADDRL) == 0) 4302 return (GLD_SUCCESS); 4303 4304 ibd_n2h_gid(mcast, &mgid); 4305 4306 if (op == GLD_MULTI_ENABLE) { 4307 DPRINT(1, "ibd_set_multicast : %016llx:%016llx\n", 4308 mgid.gid_prefix, mgid.gid_guid); 4309 ret = GLD_RETRY; 4310 mutex_enter(&state->id_mc_mutex); 4311 if (state->id_multi_op == NOTSTARTED) { 4312 state->id_multi_req.rq_gid = mgid; 4313 ibd_queue_work_slot(state, &state->id_multi_req, 4314 ASYNC_JOIN); 4315 state->id_multi_op = ONGOING; 4316 bcopy(mcast, &state->id_multi_addr, IPOIB_ADDRL); 4317 } else if (bcmp(&state->id_multi_addr, mcast, 4318 IPOIB_ADDRL) == 0) { 4319 if (state->id_multi_op != ONGOING) { 4320 if (state->id_multi_op == COMPLETED) 4321 ret = GLD_SUCCESS; 4322 else if (state->id_multi_op == ERRORED) 4323 ret = GLD_FAILURE; 4324 if (state->id_multi_queued) { 4325 state->id_multi_queued = B_FALSE; 4326 ibd_queue_work_slot(state, 4327 &state->id_multi_req, ASYNC_POKE); 4328 } else { 4329 state->id_multi_op = NOTSTARTED; 4330 } 4331 } 4332 } else { 4333 /* 4334 * Hmmm, a set was tried on another mcg. We 4335 * need to make sure to gld_sched for this 4336 * stream to retry once the ongoing one terminates. 4337 * The gld_sched out of the async thread on completion 4338 * of the mcg join is not enough; because the queued 4339 * stream might come in and get a RETRY again because 4340 * the mcg join result has still not been reaped by 4341 * the originator. If gld_sched ensured that streams 4342 * get tried in the order they received RETRYs, things 4343 * would be simpler. 4344 */ 4345 state->id_multi_queued = B_TRUE; 4346 } 4347 mutex_exit(&state->id_mc_mutex); 4348 } else { 4349 ibd_mce_t *mce; 4350 DPRINT(1, "ibd_set_multicast : unset_multicast : " 4351 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4352 ret = GLD_SUCCESS; 4353 mutex_enter(&state->id_mc_mutex); 4354 mce = IBD_MCACHE_FIND_FULL(state, mgid); 4355 mutex_exit(&state->id_mc_mutex); 4356 /* 4357 * GLD should not have invoked us unless the mcg was 4358 * added in the past. 4359 */ 4360 ASSERT(mce != NULL); 4361 ASSERT(bcmp(&mce->mc_req.rq_gid, &mgid, sizeof (mgid)) == 0); 4362 ibd_queue_work_slot(state, &mce->mc_req, ASYNC_LEAVE); 4363 } 4364 return (ret); 4365 } 4366 4367 /* 4368 * The blocking part of the IBA promiscuous operations are done 4369 * out of here on the async thread. The dlpireq parameter indicates 4370 * whether this invocation is due to a dlpi request or due to 4371 * a port up/down event. 4372 */ 4373 static void 4374 ibd_async_unsetprom(ibd_state_t *state, boolean_t dlpireq) 4375 { 4376 ibd_mce_t *mce = list_head(&state->id_mc_non); 4377 ib_gid_t mgid; 4378 4379 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4380 4381 /* 4382 * Mark the request slot as empty and reusable for the 4383 * next promiscuous set request. 4384 */ 4385 if (dlpireq) 4386 state->id_prom_op = NOTSTARTED; 4387 4388 while (mce != NULL) { 4389 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4390 mce = list_next(&state->id_mc_non, mce); 4391 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4392 } 4393 } 4394 4395 /* 4396 * The blocking part of the IBA promiscuous operations are done 4397 * out of here on the async thread. The dlpireq parameter indicates 4398 * whether this invocation is due to a dlpi request or due to 4399 * a port up/down event. 4400 */ 4401 static void 4402 ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq) 4403 { 4404 ibt_mcg_attr_t mcg_attr; 4405 ibt_mcg_info_t *mcg_info; 4406 ib_gid_t mgid; 4407 uint_t numg; 4408 int i; 4409 4410 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4411 4412 /* 4413 * Obtain all active MC groups on the IB fabric with 4414 * specified criteria (scope + Pkey + Qkey + mtu). 4415 */ 4416 bzero(&mcg_attr, sizeof (mcg_attr)); 4417 mcg_attr.mc_pkey = state->id_pkey; 4418 mcg_attr.mc_scope = state->id_scope; 4419 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4420 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4421 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4422 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4423 IBT_SUCCESS) { 4424 ibd_print_warn(state, "Could not get list of IBA multicast " 4425 "groups"); 4426 if (dlpireq) 4427 state->id_prom_op = ERRORED; 4428 return; 4429 } 4430 4431 /* 4432 * Iterate over the returned mcg's and join as NonMember 4433 * to the IP mcg's. 4434 */ 4435 for (i = 0; i < numg; i++) { 4436 /* 4437 * Do a NonMember JOIN on the MC group. 4438 */ 4439 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4440 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4441 ibd_print_warn(state, "IBA promiscuous mode missed " 4442 "multicast gid %016llx:%016llx", 4443 (u_longlong_t)mgid.gid_prefix, 4444 (u_longlong_t)mgid.gid_guid); 4445 } 4446 4447 ibt_free_mcg_info(mcg_info, numg); 4448 if (dlpireq) 4449 state->id_prom_op = COMPLETED; 4450 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4451 } 4452 4453 /* 4454 * GLD entry point for multicast promiscuous enable/disable requests. 4455 * GLD assumes phys state receives more packets than multi state, 4456 * which is not true for IPoIB. Thus, treat the multi and phys 4457 * promiscuous states the same way to work with GLD's assumption. 4458 */ 4459 static int 4460 ibd_set_promiscuous(gld_mac_info_t *macinfo, int mode) 4461 { 4462 ibd_state_t *state; 4463 int ret; 4464 4465 state = (ibd_state_t *)macinfo->gldm_private; 4466 switch (mode) { 4467 case GLD_MAC_PROMISC_PHYS: 4468 case GLD_MAC_PROMISC_MULTI: 4469 DPRINT(1, "ibd_set_promiscuous : set_promisc : %d", 4470 mode); 4471 /* 4472 * Look at gld: this might be getting 4473 * called because someone is turning off 4474 * prom_phys. Nothing needs to be done in 4475 * that case. 4476 */ 4477 ret = GLD_RETRY; 4478 mutex_enter(&state->id_mc_mutex); 4479 switch (state->id_prom_op) { 4480 case NOTSTARTED: 4481 ibd_queue_work_slot(state, 4482 &state->id_prom_req, ASYNC_PROMON); 4483 state->id_prom_op = ONGOING; 4484 break; 4485 case COMPLETED: 4486 ret = GLD_SUCCESS; 4487 break; 4488 case ERRORED: 4489 state->id_prom_op = NOTSTARTED; 4490 ret = GLD_FAILURE; 4491 } 4492 /* 4493 * Else in the ONGOING case, nothing special 4494 * needs to be done; the async thread will poke 4495 * all streams. A prior set, or the last unset 4496 * request is still in the async queue. 4497 */ 4498 mutex_exit(&state->id_mc_mutex); 4499 return (ret); 4500 case GLD_MAC_PROMISC_NONE: 4501 DPRINT(1, "ibd_set_promiscuous : unset_promisc"); 4502 /* 4503 * Look at gld: this might be getting 4504 * called because someone is turning off 4505 * prom_phys or prom_multi. Mark operation 4506 * as ongoing, to prevent a subsequent set 4507 * operation from using the request slot 4508 * unless the async thread is ready to give 4509 * it up. The async thread will mark the 4510 * request slot as usable as soon as it 4511 * starts doing the unset operation. 4512 */ 4513 ASSERT(state->id_prom_op == COMPLETED); 4514 state->id_prom_op = ONGOING; 4515 ibd_queue_work_slot(state, &state->id_prom_req, 4516 ASYNC_PROMOFF); 4517 return (GLD_SUCCESS); 4518 default: 4519 return (GLD_NOTSUPPORTED); 4520 } 4521 } 4522 4523 /* 4524 * GLD entry point for gathering statistics. 4525 */ 4526 static int 4527 ibd_get_stats(gld_mac_info_t *macinfo, struct gld_stats *sp) 4528 { 4529 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4530 4531 sp->glds_errrcv = 0; 4532 sp->glds_underflow = 0; 4533 sp->glds_missed = 0; 4534 4535 sp->glds_overflow = state->id_tx_short; /* Tx overflow */ 4536 sp->glds_speed = state->id_link_speed; 4537 sp->glds_media = GLDM_IB; 4538 sp->glds_errxmt = state->id_ah_error; /* failed AH translation */ 4539 sp->glds_norcvbuf = state->id_rx_short; /* # times below water mark */ 4540 sp->glds_intr = state->id_num_intrs; /* number of intrs */ 4541 4542 return (GLD_SUCCESS); 4543 } 4544 4545 /* 4546 * Arrange for a Tx request that is failing, or has already failed due to 4547 * Tx descriptor shortage to be retried soon. Used mostly with poll based 4548 * Tx completion, since gld_sched() can not be invoked in ibd_send() context 4549 * due to potential single processor deadlock (when the ibd_send() is 4550 * caused by gld_recv()). 4551 */ 4552 static void 4553 ibd_tx_sched(ibd_state_t *state) 4554 { 4555 mutex_enter(&state->id_sched_lock); 4556 /* 4557 * If a sched request is already enqueued, do not try to do 4558 * that again, since the async work request list would get 4559 * corrupted. 4560 */ 4561 if (!state->id_sched_queued) { 4562 state->id_sched_queued = B_TRUE; 4563 ibd_queue_work_slot(state, &state->id_sched_req, ASYNC_SCHED); 4564 } 4565 mutex_exit(&state->id_sched_lock); 4566 } 4567 4568 /* 4569 * The gld_sched() in ibd_async_work() does the work for us. 4570 */ 4571 static void 4572 ibd_async_txsched(ibd_state_t *state) 4573 { 4574 mutex_enter(&state->id_sched_lock); 4575 state->id_sched_queued = B_FALSE; 4576 mutex_exit(&state->id_sched_lock); 4577 } 4578 4579 /* 4580 * Release one or more chained send wqes back into free list. 4581 */ 4582 static void 4583 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *fswqe, ibd_swqe_t *lswqe, 4584 boolean_t send_context) 4585 { 4586 boolean_t call_gld_sched = B_FALSE; 4587 4588 /* 4589 * Add back on Tx list for reuse. 4590 */ 4591 lswqe->swqe_next = NULL; 4592 mutex_enter(&state->id_tx_list.dl_mutex); 4593 if (state->id_tx_list.dl_pending_sends) { 4594 state->id_tx_list.dl_pending_sends = B_FALSE; 4595 call_gld_sched = B_TRUE; 4596 } 4597 if (state->id_tx_list.dl_head == NULL) { 4598 state->id_tx_list.dl_head = SWQE_TO_WQE(fswqe); 4599 } else { 4600 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(fswqe); 4601 } 4602 state->id_tx_list.dl_tail = SWQE_TO_WQE(lswqe); 4603 mutex_exit(&state->id_tx_list.dl_mutex); 4604 4605 /* 4606 * See comments in ibd_tx_sched(); make sure not to call 4607 * gld_sched() if we are in ibd_send() context. 4608 */ 4609 if (call_gld_sched) 4610 if ((ibd_txcomp_poll == 0) && (!send_context)) 4611 gld_sched(state->id_macinfo); 4612 else 4613 ibd_tx_sched(state); 4614 } 4615 4616 /* 4617 * Acquire a number of chained send wqe's from the free list. Returns the 4618 * number of wqe's actually allocated, and pointers to the first and last 4619 * in the chain. 4620 */ 4621 static int 4622 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe, 4623 int number) 4624 { 4625 int numwqe = number; 4626 ibd_swqe_t *node, *wqes; 4627 4628 /* 4629 * Check and reclaim some of the completed Tx requests. 4630 * If someone else is already in this code and pulling Tx 4631 * completions, no need to poll, since the current lock holder 4632 * will do the work anyway. Normally, we poll for completions 4633 * every few Tx attempts, but if we are short on Tx descriptors, 4634 * we always try to poll. 4635 */ 4636 if ((ibd_txcomp_poll == 1) && 4637 (((atomic_add_32_nv(&state->id_tx_sends, 1) & IBD_TXPOLL_MASK) == 4638 0) || state->id_tx_list.dl_pending_sends) && 4639 (mutex_tryenter(&state->id_txcomp_lock) != 0)) { 4640 DPRINT(10, "ibd_send : polling"); 4641 ibd_poll_compq(state, state->id_scq_hdl); 4642 mutex_exit(&state->id_txcomp_lock); 4643 } 4644 4645 /* 4646 * Grab required transmit wqes. 4647 */ 4648 mutex_enter(&state->id_tx_list.dl_mutex); 4649 node = wqes = WQE_TO_SWQE(state->id_tx_list.dl_head); 4650 while ((node != NULL) && (numwqe-- > 1)) 4651 node = WQE_TO_SWQE(node->swqe_next); 4652 4653 /* 4654 * If we did not find the number we were looking for, flag no resource. 4655 * Adjust list appropriately in either case. 4656 */ 4657 if (numwqe != 0) { 4658 state->id_tx_list.dl_head = state->id_tx_list.dl_tail = NULL; 4659 state->id_tx_list.dl_pending_sends = B_TRUE; 4660 mutex_exit(&state->id_tx_list.dl_mutex); 4661 DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); 4662 atomic_add_64(&state->id_tx_short, 1); 4663 if (ibd_txcomp_poll == 1) { 4664 /* 4665 * Arrange for a future gld_sched(). Note that when 4666 * the Tx is retried after a little bit, it will 4667 * surely poll the completion queue above. 4668 */ 4669 ibd_tx_sched(state); 4670 } 4671 } else { 4672 state->id_tx_list.dl_head = node->swqe_next; 4673 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(node)) 4674 state->id_tx_list.dl_tail = NULL; 4675 mutex_exit(&state->id_tx_list.dl_mutex); 4676 } 4677 4678 /* 4679 * Set return parameters. 4680 */ 4681 *fswqe = wqes; 4682 *lswqe = node; 4683 return (number - numwqe); 4684 } 4685 4686 typedef struct ibd_mpack_s { 4687 ibd_swqe_t *ip_swqe; 4688 uint32_t ip_start, ip_stuff, ip_flags; 4689 ibd_ace_t *ip_ace; 4690 boolean_t ip_copy; 4691 boolean_t ip_noresources; 4692 int ip_segs; 4693 ibt_mr_hdl_t ip_mhdl[IBD_MDTMAX_SEGS + 1]; 4694 ibt_mr_desc_t ip_mdsc[IBD_MDTMAX_SEGS + 1]; 4695 } ibd_mpack_t; 4696 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mpack_s)) 4697 4698 static void 4699 ibd_mdt_txone(gld_mac_info_t *macinfo, void *cookie, pdescinfo_t *dl_pkt_info) 4700 { 4701 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4702 ibd_mpack_t *ptx = (ibd_mpack_t *)cookie; 4703 ibd_ace_t *ace = ptx->ip_ace; 4704 ibd_swqe_t *wqes, *node = ptx->ip_swqe; 4705 boolean_t docopy = ptx->ip_copy; 4706 uchar_t *pptr; 4707 int i, pktsize, seglen, seg = 0; 4708 4709 /* 4710 * Snag the next wqe before we post this one, since it could complete 4711 * very fast and the wqe could get put at the end of the list, 4712 * corrupting our chain. Set up for the next packet. 4713 */ 4714 wqes = WQE_TO_SWQE(node->swqe_next); 4715 ptx->ip_swqe = wqes; 4716 4717 IBD_CKSUM_MDT_PACKET(dl_pkt_info, ptx->ip_start, ptx->ip_stuff, 4718 ptx->ip_flags); 4719 node->w_ahandle = ace; 4720 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4721 4722 if (docopy) { 4723 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 4724 pptr = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 4725 pktsize = seglen = PDESC_HDRL(dl_pkt_info); 4726 if (seglen > 0) { 4727 bcopy(dl_pkt_info->hdr_rptr, pptr, seglen); 4728 pptr += seglen; 4729 } 4730 for (; seg < dl_pkt_info->pld_cnt; seg++) 4731 if ((seglen = PDESC_PLDL(dl_pkt_info, seg)) > 0) { 4732 bcopy(dl_pkt_info->pld_ary[seg].pld_rptr, 4733 pptr, seglen); 4734 pptr += seglen; 4735 pktsize += seglen; 4736 } 4737 node->w_swr.wr_nds = 1; 4738 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 4739 } else { 4740 seglen = PDESC_HDRL(dl_pkt_info); 4741 if (seglen > 0) { 4742 node->w_smblk_sgl[seg].ds_va = 4743 (ib_vaddr_t)(uintptr_t)dl_pkt_info->hdr_rptr; 4744 node->w_smblk_sgl[seg].ds_key = ptx->ip_mdsc[0].md_lkey; 4745 node->w_smblk_sgl[seg].ds_len = seglen; 4746 seg++; 4747 } 4748 for (i = 0; i < dl_pkt_info->pld_cnt; i++) { 4749 if ((seglen = PDESC_PLDL(dl_pkt_info, i)) > 0) { 4750 node->w_smblk_sgl[seg].ds_va = (ib_vaddr_t) 4751 (uintptr_t)dl_pkt_info->pld_ary[i].pld_rptr; 4752 node->w_smblk_sgl[seg].ds_key = 4753 ptx->ip_mdsc[dl_pkt_info-> 4754 pld_ary[i].pld_pbuf_idx + 1].md_lkey; 4755 node->w_smblk_sgl[seg].ds_len = seglen; 4756 seg++; 4757 } 4758 } 4759 node->w_swr.wr_sgl = node->w_smblk_sgl; 4760 node->w_swr.wr_nds = seg; 4761 } 4762 4763 if (ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL) != 4764 IBT_SUCCESS) { 4765 /* 4766 * We never expect a failure here. But handle it, just in case. 4767 * If this is not the last packet, there are no problems; if 4768 * it is the last packet and the previous ones have not been 4769 * transmitted yet by the hardware, in the registration case, 4770 * the hardware might transmit garbage since we will be 4771 * freemsg'ing. The AH is still safe. 4772 */ 4773 DPRINT(5, "ibd_mdt_txone: posting failed"); 4774 ibd_tx_cleanup(state, node, B_TRUE); 4775 } 4776 } 4777 4778 static int 4779 ibd_mdt_pre(gld_mac_info_t *macinfo, mblk_t *mp, void **cookie) 4780 { 4781 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4782 multidata_t *dlmdp = mmd_getmultidata(mp); 4783 ibd_mpack_t *mdinfo; 4784 mbufinfo_t bufinfo, *binfo = &bufinfo; 4785 pattrinfo_t attr_info; 4786 uchar_t *dlap; 4787 ibt_mr_attr_t mem_attr; 4788 ibd_swqe_t *wqes, *node; 4789 ipoib_mac_t *dest; 4790 size_t hsize, psize = 0; 4791 int numwqes, numpackets = (int)mmd_getcnt(dlmdp, NULL, NULL); 4792 int i, ret; 4793 uint32_t end, value; 4794 boolean_t noresources = B_FALSE; 4795 4796 ASSERT(DB_TYPE(mp) == M_MULTIDATA); 4797 ASSERT(mp->b_cont == NULL); 4798 4799 if ((numwqes = ibd_acquire_swqes(state, &wqes, &node, numpackets)) == 0) 4800 return (0); 4801 else if (numwqes != numpackets) 4802 noresources = B_TRUE; 4803 4804 DPRINT(20, "ibd_mdt_pre: %d packets %p/%p\n", numwqes, wqes, node); 4805 4806 /* 4807 * Allocate the cookie that will be passed to subsequent packet 4808 * transmit and post_mdt calls by GLD. We can not sleep, so if 4809 * there is no memory, just tell GLD to drop the entire MDT message. 4810 */ 4811 if ((mdinfo = kmem_zalloc(sizeof (ibd_mpack_t), KM_NOSLEEP)) == NULL) { 4812 ibd_release_swqes(state, wqes, node, B_TRUE); 4813 return (-1); 4814 } 4815 *cookie = (void *)mdinfo; 4816 mdinfo->ip_noresources = noresources; 4817 4818 /* 4819 * Walk Global Attributes. If TCP failed to provide destination 4820 * information, or some interposing module removed the information, 4821 * fail the entire message. 4822 */ 4823 attr_info.type = PATTR_DSTADDRSAP; 4824 if (mmd_getpattr(dlmdp, NULL, &attr_info) == NULL) { 4825 ibd_release_swqes(state, wqes, node, B_TRUE); 4826 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4827 return (-1); 4828 } 4829 dlap = ((pattr_addr_t *)attr_info.buf)->addr; 4830 dest = (ipoib_mac_t *)dlap; 4831 4832 /* 4833 * Get the AH for this destination, incrementing the posted 4834 * reference count properly. 4835 */ 4836 if ((mdinfo->ip_ace = ibd_acache_lookup(state, dest, &ret, 4837 numwqes)) == NULL) { 4838 ibd_release_swqes(state, wqes, node, B_TRUE); 4839 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4840 return ((ret == GLD_FAILURE) ? -1 : 0); 4841 } 4842 4843 /* 4844 * Depending on how costly it is to copy vs register, we try to 4845 * register, falling back on copying if we fail. 4846 */ 4847 mmd_getregions(dlmdp, &bufinfo); 4848 hsize = binfo->hbuf_wptr - binfo->hbuf_rptr; 4849 for (i = 0; i < binfo->pbuf_cnt; i++) 4850 psize += (binfo->pbuf_ary[i].pbuf_wptr - 4851 binfo->pbuf_ary[i].pbuf_rptr); 4852 if ((hsize + psize) > IBD_TX_COPY_THRESHOLD) { 4853 mdinfo->ip_segs = i + 1; 4854 if (hsize != 0) { 4855 mem_attr.mr_as = NULL; 4856 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4857 mem_attr.mr_vaddr = 4858 (uint64_t)(uintptr_t)binfo->hbuf_rptr; 4859 mem_attr.mr_len = hsize; 4860 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 4861 &mem_attr, &mdinfo->ip_mhdl[0], 4862 &mdinfo->ip_mdsc[0]) != IBT_SUCCESS) 4863 goto ibd_mdt_copy; 4864 DPRINT(10, "ibd_mdt_pre: hsize = %d\n", hsize); 4865 } 4866 for (i = 0; i < binfo->pbuf_cnt; i++) { 4867 if ((psize = (binfo->pbuf_ary[i].pbuf_wptr - 4868 binfo->pbuf_ary[i].pbuf_rptr)) != 0) { 4869 mem_attr.mr_as = NULL; 4870 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4871 mem_attr.mr_vaddr = (uint64_t)(uintptr_t) 4872 binfo->pbuf_ary[i].pbuf_rptr; 4873 mem_attr.mr_len = psize; 4874 if (ibt_register_mr(state->id_hca_hdl, 4875 state->id_pd_hdl, &mem_attr, 4876 &mdinfo->ip_mhdl[i + 1], 4877 &mdinfo->ip_mdsc[i + 1]) != IBT_SUCCESS) { 4878 for (; i >= 0; i--) { 4879 (void) ibt_deregister_mr( 4880 state->id_hca_hdl, 4881 mdinfo->ip_mhdl[i]); 4882 } 4883 goto ibd_mdt_copy; 4884 } 4885 DPRINT(10, "ibd_mdt_pre: psize = %lu\n", psize); 4886 } 4887 } 4888 4889 mdinfo->ip_copy = B_FALSE; 4890 4891 /* 4892 * All the deregistration must happen once the last swqe 4893 * completes. 4894 */ 4895 node->swqe_im_mblk = mp; 4896 node->w_mdtinfo = mdinfo; 4897 DPRINT(10, "ibd_mdt_pre: last wqe = %p\n", node); 4898 } else { 4899 ibd_mdt_copy: 4900 mdinfo->ip_copy = B_TRUE; 4901 } 4902 4903 /* 4904 * Do checksum related work. 4905 */ 4906 IBD_CKSUM_MDT(mp, dlmdp, NULL, &mdinfo->ip_start, &mdinfo->ip_stuff, 4907 &end, &value, &mdinfo->ip_flags); 4908 4909 mdinfo->ip_swqe = wqes; 4910 return (numwqes); 4911 } 4912 4913 /* ARGSUSED */ 4914 static void 4915 ibd_mdt_post(gld_mac_info_t *macinfo, mblk_t *mp, void *cookie) 4916 { 4917 ibd_mpack_t *mdinfo = (ibd_mpack_t *)cookie; 4918 4919 if (mdinfo->ip_copy) { 4920 if (!mdinfo->ip_noresources) 4921 freemsg(mp); 4922 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4923 } 4924 } 4925 4926 /* 4927 * GLD entry point for transmitting a datagram. 4928 * The passed in packet has this format: 4929 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 4930 */ 4931 static int 4932 ibd_send(gld_mac_info_t *macinfo, mblk_t *mp) 4933 { 4934 ibt_status_t ibt_status; 4935 ibt_mr_attr_t mem_attr; 4936 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4937 ibd_ace_t *ace; 4938 ibd_swqe_t *node; 4939 ipoib_mac_t *dest; 4940 ipoib_ptxhdr_t *ipibp; 4941 ip6_t *ip6h; 4942 mblk_t *nmp = mp; 4943 uint_t pktsize; 4944 size_t blksize; 4945 uchar_t *bufp; 4946 int i, ret, len, nmblks = 1; 4947 boolean_t dofree = B_TRUE; 4948 4949 if (ibd_acquire_swqes(state, &node, &node, 1) == 0) 4950 return (GLD_NORESOURCES); 4951 4952 /* 4953 * Obtain an address handle for the destination. 4954 */ 4955 dest = (ipoib_mac_t *)mp->b_rptr; 4956 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 4957 node->w_ahandle = ace; 4958 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4959 } else { 4960 DPRINT(5, 4961 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 4962 ((ret == GLD_FAILURE) ? "failed" : "queued"), 4963 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 4964 htonl(dest->ipoib_gidpref[1]), 4965 htonl(dest->ipoib_gidsuff[0]), 4966 htonl(dest->ipoib_gidsuff[1])); 4967 node->w_ahandle = NULL; 4968 goto ibd_send_fail; 4969 } 4970 4971 /* 4972 * For ND6 packets, padding is at the front of the source lladdr. 4973 * Insert the padding at front. 4974 */ 4975 ipibp = (ipoib_ptxhdr_t *)mp->b_rptr; 4976 if (ntohs(ipibp->ipoib_rhdr.ipoib_type) == IP6_DL_SAP) { 4977 if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + IPV6_HDR_LEN) { 4978 if (!pullupmsg(mp, IPV6_HDR_LEN + 4979 sizeof (ipoib_ptxhdr_t))) { 4980 DPRINT(10, "ibd_send: pullupmsg failure "); 4981 ret = GLD_FAILURE; 4982 goto ibd_send_fail; 4983 } 4984 } 4985 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_ptxhdr_t)); 4986 len = ntohs(ip6h->ip6_plen); 4987 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 4988 if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + 4989 IPV6_HDR_LEN + len) { 4990 if (!pullupmsg(mp, sizeof (ipoib_ptxhdr_t) + 4991 IPV6_HDR_LEN + len)) { 4992 DPRINT(10, "ibd_send: pullupmsg " 4993 "failure "); 4994 ret = GLD_FAILURE; 4995 goto ibd_send_fail; 4996 } 4997 } 4998 /* LINTED: E_CONSTANT_CONDITION */ 4999 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 5000 } 5001 } 5002 5003 mp->b_rptr += IPOIB_ADDRL; 5004 while (((nmp = nmp->b_cont) != NULL) && 5005 (++nmblks < (state->id_max_sqseg + 1))); 5006 pktsize = msgsize(mp); 5007 if (pktsize > state->id_mtu) { 5008 ret = GLD_BADARG; 5009 goto ibd_send_fail; 5010 } 5011 5012 /* 5013 * Do checksum related work. 5014 */ 5015 IBD_CKSUM_SEND(mp); 5016 5017 /* 5018 * Copy the data to preregistered buffers, or register the buffer. 5019 */ 5020 if ((nmblks <= state->id_max_sqseg) && 5021 (pktsize > IBD_TX_COPY_THRESHOLD)) { 5022 for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) { 5023 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr; 5024 mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr; 5025 mem_attr.mr_as = NULL; 5026 mem_attr.mr_flags = IBT_MR_NOSLEEP; 5027 ibt_status = ibt_register_mr(state->id_hca_hdl, 5028 state->id_pd_hdl, &mem_attr, 5029 &node->w_smblkbuf[i].im_mr_hdl, 5030 &node->w_smblkbuf[i].im_mr_desc); 5031 if (ibt_status != IBT_SUCCESS) { 5032 /* 5033 * We do not expect any error other than 5034 * IBT_INSUFF_RESOURCE. 5035 */ 5036 if (ibt_status != IBT_INSUFF_RESOURCE) 5037 DPRINT(10, "ibd_send:%d\n", 5038 "failed in ibt_register_mem()", 5039 ibt_status); 5040 DPRINT(5, "ibd_send: registration failed"); 5041 node->w_swr.wr_nds = i; 5042 /* 5043 * Deregister already registered memory; 5044 * fallback to copying the mblk. 5045 */ 5046 ibd_deregister_mr(state, node); 5047 goto ibd_copy_path; 5048 } 5049 node->w_smblk_sgl[i].ds_va = 5050 (ib_vaddr_t)(uintptr_t)nmp->b_rptr; 5051 node->w_smblk_sgl[i].ds_key = 5052 node->w_smblkbuf[i].im_mr_desc.md_lkey; 5053 node->w_smblk_sgl[i].ds_len = 5054 nmp->b_wptr - nmp->b_rptr; 5055 } 5056 node->swqe_im_mblk = mp; 5057 node->w_swr.wr_sgl = node->w_smblk_sgl; 5058 node->w_swr.wr_nds = nmblks; 5059 dofree = B_FALSE; 5060 } else { 5061 ibd_copy_path: 5062 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5063 node->w_swr.wr_nds = 1; 5064 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5065 5066 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5067 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 5068 blksize = MBLKL(nmp); 5069 bcopy(nmp->b_rptr, bufp, blksize); 5070 bufp += blksize; 5071 } 5072 } 5073 5074 /* 5075 * Queue the wqe to hardware. 5076 */ 5077 ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL); 5078 if (ibt_status != IBT_SUCCESS) { 5079 /* 5080 * We should not fail here; but just in case we do, we 5081 * tell GLD about this error. 5082 */ 5083 ret = GLD_FAILURE; 5084 DPRINT(5, "ibd_send: posting failed"); 5085 goto ibd_send_fail; 5086 } 5087 5088 DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X", 5089 INCTXPACK, htonl(ace->ac_mac.ipoib_qpn), 5090 htonl(ace->ac_mac.ipoib_gidpref[0]), 5091 htonl(ace->ac_mac.ipoib_gidpref[1]), 5092 htonl(ace->ac_mac.ipoib_gidsuff[0]), 5093 htonl(ace->ac_mac.ipoib_gidsuff[1])); 5094 5095 if (dofree) 5096 freemsg(mp); 5097 5098 return (GLD_SUCCESS); 5099 5100 ibd_send_fail: 5101 ibd_tx_cleanup(state, node, B_TRUE); 5102 return (ret); 5103 } 5104 5105 /* 5106 * GLD entry point for handling interrupts. When using combined CQ, 5107 * this handles Tx and Rx completions. With separate CQs, this handles 5108 * only Rx completions. 5109 */ 5110 static uint_t 5111 ibd_intr(gld_mac_info_t *macinfo) 5112 { 5113 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 5114 5115 /* 5116 * Poll for completed entries; the CQ will not interrupt any 5117 * more for incoming (or transmitted) packets. 5118 */ 5119 ibd_poll_compq(state, state->id_rcq_hdl); 5120 5121 /* 5122 * Now enable CQ notifications; all packets that arrive now 5123 * (or complete transmission) will cause new interrupts. 5124 */ 5125 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 5126 IBT_SUCCESS) { 5127 /* 5128 * We do not expect a failure here. 5129 */ 5130 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 5131 } 5132 5133 /* 5134 * Repoll to catch all packets that might have arrived after 5135 * we finished the first poll loop and before interrupts got 5136 * armed. 5137 */ 5138 ibd_poll_compq(state, state->id_rcq_hdl); 5139 5140 return (DDI_INTR_CLAIMED); 5141 } 5142 5143 /* 5144 * Common code for interrupt handling as well as for polling 5145 * for all completed wqe's while detaching. 5146 */ 5147 static void 5148 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5149 { 5150 ibd_wqe_t *wqe; 5151 ibt_wc_t *wc, *wcs; 5152 uint_t numwcs; 5153 int i; 5154 5155 /* 5156 * In some cases (eg detaching), this code can be invoked on 5157 * any cpu after disabling cq notification (thus no concurrency 5158 * exists). Apart from that, the following applies normally: 5159 * The receive completion handling is always on the Rx interrupt 5160 * cpu. Transmit completion handling could be from any cpu if 5161 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 5162 * is interrupt driven. Combined completion handling is always 5163 * on the interrupt cpu. Thus, lock accordingly and use the 5164 * proper completion array. 5165 */ 5166 if (cq_hdl == state->id_rcq_hdl) 5167 wcs = state->id_wcs; 5168 else 5169 wcs = state->id_txwcs; 5170 5171 while (ibt_poll_cq(cq_hdl, wcs, IBD_WC_SIZE, &numwcs) == IBT_SUCCESS) { 5172 5173 for (i = 0, wc = wcs; i < numwcs; i++, wc++) { 5174 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5175 ASSERT((wqe->w_type == IBD_WQE_SEND) || 5176 (wqe->w_type == IBD_WQE_RECV)); 5177 if (wc->wc_status != IBT_WC_SUCCESS) { 5178 /* 5179 * Channel being torn down. 5180 */ 5181 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5182 DPRINT(5, "ibd_intr: flush error"); 5183 /* 5184 * Only invoke the Tx handler to 5185 * release possibly held resources 5186 * like AH refcount etc. Can not 5187 * invoke Rx handler because it might 5188 * try adding buffers to the Rx pool 5189 * when we are trying to deinitialize. 5190 */ 5191 if (wqe->w_type == IBD_WQE_RECV) 5192 continue; 5193 } else { 5194 DPRINT(10, "%s %d", 5195 "ibd_intr: Bad CQ status", 5196 wc->wc_status); 5197 } 5198 } 5199 if (wqe->w_type == IBD_WQE_SEND) 5200 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe), 5201 B_FALSE); 5202 else 5203 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5204 } 5205 } 5206 } 5207 5208 /* 5209 * Deregister the mr associated with a given mblk. 5210 */ 5211 static void 5212 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) 5213 { 5214 int i; 5215 5216 DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe, 5217 swqe->w_swr.wr_nds); 5218 /* 5219 * If this is an MDT case, process accordingly. 5220 */ 5221 if (swqe->w_mdtinfo != NULL) { 5222 ibd_mpack_t *mdinfo = (ibd_mpack_t *)swqe->w_mdtinfo; 5223 5224 for (i = 0; i < mdinfo->ip_segs; i++) 5225 if ((mdinfo->ip_mhdl[i] != 0) && 5226 (ibt_deregister_mr(state->id_hca_hdl, 5227 mdinfo->ip_mhdl[i]) != IBT_SUCCESS)) 5228 DPRINT(10, "MDT deregistration failed\n"); 5229 ASSERT(!mdinfo->ip_copy); 5230 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 5231 swqe->w_mdtinfo = NULL; 5232 return; 5233 } 5234 5235 for (i = 0; i < swqe->w_swr.wr_nds; i++) { 5236 if (ibt_deregister_mr(state->id_hca_hdl, 5237 swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) { 5238 /* 5239 * We do not expect any errors here. 5240 */ 5241 DPRINT(10, "failed in ibt_deregister_mem()\n"); 5242 } 5243 } 5244 } 5245 5246 /* 5247 * Common code that deals with clean ups after a successful or 5248 * erroneous transmission attempt. 5249 */ 5250 static void 5251 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context) 5252 { 5253 ibd_ace_t *ace = swqe->w_ahandle; 5254 5255 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 5256 5257 /* 5258 * If this was a dynamic registration in ibd_send() or in MDT, 5259 * deregister now. 5260 */ 5261 if (swqe->swqe_im_mblk != NULL) { 5262 ibd_deregister_mr(state, swqe); 5263 freemsg(swqe->swqe_im_mblk); 5264 swqe->swqe_im_mblk = NULL; 5265 } 5266 5267 /* 5268 * Drop the reference count on the AH; it can be reused 5269 * now for a different destination if there are no more 5270 * posted sends that will use it. This can be eliminated 5271 * if we can always associate each Tx buffer with an AH. 5272 * The ace can be null if we are cleaning up from the 5273 * ibd_send() error path. 5274 */ 5275 if (ace != NULL) { 5276 /* 5277 * The recycling logic can be eliminated from here 5278 * and put into the async thread if we create another 5279 * list to hold ACE's for unjoined mcg's. 5280 */ 5281 if (DEC_REF_DO_CYCLE(ace)) { 5282 ibd_mce_t *mce; 5283 5284 /* 5285 * Check with the lock taken: we decremented 5286 * reference count without the lock, and some 5287 * transmitter might alreay have bumped the 5288 * reference count (possible in case of multicast 5289 * disable when we leave the AH on the active 5290 * list). If not still 0, get out, leaving the 5291 * recycle bit intact. 5292 * 5293 * Atomically transition the AH from active 5294 * to free list, and queue a work request to 5295 * leave the group and destroy the mce. No 5296 * transmitter can be looking at the AH or 5297 * the MCE in between, since we have the 5298 * ac_mutex lock. In the SendOnly reap case, 5299 * it is not neccesary to hold the ac_mutex 5300 * and recheck the ref count (since the AH was 5301 * taken off the active list), we just do it 5302 * to have uniform processing with the Full 5303 * reap case. 5304 */ 5305 mutex_enter(&state->id_ac_mutex); 5306 mce = ace->ac_mce; 5307 if (GET_REF_CYCLE(ace) == 0) { 5308 CLEAR_REFCYCLE(ace); 5309 /* 5310 * Identify the case of fullmember reap as 5311 * opposed to mcg trap reap. Also, port up 5312 * might set ac_mce to NULL to indicate Tx 5313 * cleanup should do no more than put the 5314 * AH in the free list (see ibd_async_link). 5315 */ 5316 if (mce != NULL) { 5317 ace->ac_mce = NULL; 5318 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 5319 /* 5320 * mc_req was initialized at mce 5321 * creation time. 5322 */ 5323 ibd_queue_work_slot(state, 5324 &mce->mc_req, ASYNC_REAP); 5325 } 5326 IBD_ACACHE_INSERT_FREE(state, ace); 5327 } 5328 mutex_exit(&state->id_ac_mutex); 5329 } 5330 } 5331 5332 /* 5333 * Release the send wqe for reuse. 5334 */ 5335 ibd_release_swqes(state, swqe, swqe, send_context); 5336 } 5337 5338 /* 5339 * Processing to be done after receipt of a packet; hand off to GLD 5340 * in the format expected by GLD. 5341 * The recvd packet has this format: 2b sap :: 00 :: data. 5342 */ 5343 static void 5344 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 5345 { 5346 ipoib_pgrh_t *pgrh; 5347 mblk_t *mp; 5348 ipoib_hdr_t *ipibp; 5349 ip6_t *ip6h; 5350 int rxcnt, len; 5351 5352 /* 5353 * Track number handed to upper layer, and number still 5354 * available to receive packets. 5355 */ 5356 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 5357 ASSERT(rxcnt >= 0); 5358 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); 5359 5360 /* 5361 * Adjust write pointer depending on how much data came in. 5362 */ 5363 mp = rwqe->rwqe_im_mblk; 5364 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; 5365 5366 /* 5367 * If the GRH is not valid, indicate to GLD by setting 5368 * the VerTcFlow field to 0. Else, update the pseudoGRH 5369 * so that GLD can determine the source mac of the packet. 5370 */ 5371 pgrh = (ipoib_pgrh_t *)mp->b_rptr; 5372 if (wc->wc_flags & IBT_WC_GRH_PRESENT) 5373 pgrh->ipoib_sqpn = htonl(wc->wc_qpn); 5374 else 5375 pgrh->ipoib_vertcflow = 0; 5376 5377 DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK); 5378 5379 /* 5380 * For ND6 packets, padding is at the front of the source/target 5381 * lladdr. However the inet6 layer is not aware of it, hence remove 5382 * the padding from such packets. 5383 */ 5384 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 5385 if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) { 5386 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { 5387 if (!pullupmsg(mp, IPV6_HDR_LEN + 5388 sizeof (ipoib_hdr_t))) { 5389 DPRINT(10, "ibd_process_rx: pullupmsg failed"); 5390 freemsg(mp); 5391 return; 5392 } 5393 } 5394 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 5395 len = ntohs(ip6h->ip6_plen); 5396 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5397 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + 5398 IPV6_HDR_LEN + len) { 5399 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + 5400 IPV6_HDR_LEN + len)) { 5401 DPRINT(10, "ibd_process_rx: pullupmsg" 5402 " failed"); 5403 freemsg(mp); 5404 return; 5405 } 5406 } 5407 /* LINTED: E_CONSTANT_CONDITION */ 5408 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 5409 } 5410 } 5411 5412 /* 5413 * Hand off to service thread/GLD. When we have hardware that 5414 * does hardware checksum, we will pull the checksum from the 5415 * work completion structure here. 5416 * on interrupt cpu. 5417 */ 5418 ibd_send_up(state, mp); 5419 5420 /* 5421 * Possibly replenish the Rx pool if needed. 5422 */ 5423 if (rxcnt < IBD_RX_THRESHOLD) { 5424 state->id_rx_short++; 5425 if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) { 5426 if (ibd_post_rwqe(state, rwqe, B_FALSE) == 5427 DDI_FAILURE) { 5428 ibd_free_rwqe(state, rwqe); 5429 return; 5430 } 5431 } 5432 } 5433 } 5434 5435 /* 5436 * Callback code invoked from STREAMs when the recv data buffer is free 5437 * for recycling. 5438 */ 5439 static void 5440 ibd_freemsg_cb(char *arg) 5441 { 5442 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 5443 ibd_state_t *state = rwqe->w_state; 5444 5445 /* 5446 * If the wqe is being destructed, do not attempt recycling. 5447 */ 5448 if (rwqe->w_freeing_wqe == B_TRUE) { 5449 DPRINT(6, "ibd_freemsg_cb: wqe being freed"); 5450 return; 5451 } 5452 5453 /* 5454 * Upper layer has released held mblk. 5455 */ 5456 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 5457 5458 if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) { 5459 /* 5460 * There are already enough buffers on the Rx ring. 5461 * Free this one up. 5462 */ 5463 rwqe->rwqe_im_mblk = NULL; 5464 ibd_delete_rwqe(state, rwqe); 5465 ibd_free_rwqe(state, rwqe); 5466 DPRINT(6, "ibd_freemsg_cb: free up wqe"); 5467 } else { 5468 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 5469 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 5470 if (rwqe->rwqe_im_mblk == NULL) { 5471 ibd_delete_rwqe(state, rwqe); 5472 ibd_free_rwqe(state, rwqe); 5473 DPRINT(6, "ibd_freemsg_cb: desballoc failed"); 5474 return; 5475 } 5476 5477 /* 5478 * Post back to h/w. We could actually have more than 5479 * id_num_rwqe WQEs on the list if there were multiple 5480 * ibd_freemsg_cb() calls outstanding (since the lock is 5481 * not held the entire time). This will start getting 5482 * corrected over subsequent ibd_freemsg_cb() calls. 5483 */ 5484 if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) { 5485 ibd_delete_rwqe(state, rwqe); 5486 ibd_free_rwqe(state, rwqe); 5487 return; 5488 } 5489 } 5490 } 5491 5492 #ifdef RUN_PERFORMANCE 5493 5494 /* 5495 * To run the performance test, first do the "ifconfig ibdN plumb" on 5496 * the Rx and Tx side. Then use mdb -kw to tweak the following variables: 5497 * ibd_performance=1. 5498 * ibd_receiver=1 on Rx side. 5499 * ibd_sender=1 on Tx side. 5500 * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update 5501 * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx, this will 5502 * make it drop into a 1 minute loop waiting for packets. An 5503 * ifconfig/unplumb on the Tx will cause it to send packets to Rx. 5504 */ 5505 5506 #define IBD_NUM_UNSIGNAL ibd_num_unsignal 5507 #define IBD_TX_PKTSIZE ibd_tx_pktsize 5508 #define IBD_TX_DATASIZE ibd_tx_datasize 5509 5510 static ibd_swqe_t **swqes; 5511 static ibt_wc_t *wcs; 5512 5513 /* 5514 * Set these on Rx and Tx side to do performance run. 5515 */ 5516 static int ibd_performance = 0; 5517 static int ibd_receiver = 0; 5518 static int ibd_sender = 0; 5519 static ipoib_mac_t ibd_dest; 5520 5521 /* 5522 * Interrupt coalescing is achieved by asking for a completion intr 5523 * only every ibd_num_unsignal'th packet. 5524 */ 5525 static int ibd_num_unsignal = 8; 5526 5527 /* 5528 * How big is each packet? 5529 */ 5530 static int ibd_tx_pktsize = 2048; 5531 5532 /* 5533 * Total data size to be transmitted. 5534 */ 5535 static int ibd_tx_datasize = 512*1024*1024; 5536 5537 static volatile boolean_t cq_handler_ran = B_FALSE; 5538 static volatile int num_completions; 5539 5540 /* ARGSUSED */ 5541 static void 5542 ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg) 5543 { 5544 ibd_state_t *state = (ibd_state_t *)arg; 5545 ibt_cq_hdl_t cqhdl; 5546 ibd_wqe_t *wqe; 5547 uint_t polled, i; 5548 boolean_t cq_enabled = B_FALSE; 5549 5550 if (ibd_receiver == 1) 5551 cqhdl = state->id_rcq_hdl; 5552 else 5553 cqhdl = state->id_scq_hdl; 5554 5555 /* 5556 * Mark the handler as having run and possibly freed up some 5557 * slots. Blocked sends can be retried. 5558 */ 5559 cq_handler_ran = B_TRUE; 5560 5561 repoll: 5562 while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) == 5563 IBT_SUCCESS) { 5564 num_completions += polled; 5565 if (ibd_receiver == 1) { 5566 /* 5567 * We can immediately recycle the buffer. No 5568 * need to pass up to any IP layer ... 5569 */ 5570 for (i = 0; i < polled; i++) { 5571 wqe = (ibd_wqe_t *)wcs[i].wc_id; 5572 (void) ibt_post_recv(state->id_chnl_hdl, 5573 &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL); 5574 } 5575 } 5576 } 5577 5578 /* 5579 * If we just repolled, we are done; exit. 5580 */ 5581 if (cq_enabled) 5582 return; 5583 5584 /* 5585 * Enable CQ. 5586 */ 5587 if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 5588 /* 5589 * We do not expect a failure here. 5590 */ 5591 cmn_err(CE_CONT, "ibd_perf_handler: notify failed"); 5592 } 5593 cq_enabled = B_TRUE; 5594 5595 /* 5596 * Repoll for packets that came in after we finished previous 5597 * poll loop but before we turned on notifications. 5598 */ 5599 goto repoll; 5600 } 5601 5602 static void 5603 ibd_perf_tx(ibd_state_t *state) 5604 { 5605 ibt_mr_hdl_t mrhdl; 5606 ibt_mr_desc_t mrdesc; 5607 ibt_mr_attr_t mem_attr; 5608 ibt_status_t stat; 5609 ibd_ace_t *ace = NULL; 5610 ibd_swqe_t *node; 5611 uchar_t *sendbuf; 5612 longlong_t stime, etime; 5613 longlong_t sspin, espin, tspin = 0; 5614 int i, reps, packets; 5615 5616 cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X", 5617 htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]), 5618 htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]), 5619 htonl(ibd_dest.ipoib_gidsuff[1])); 5620 if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) || 5621 (ibd_dest.ipoib_gidpref[1] == 0)) { 5622 cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address"); 5623 return; 5624 } 5625 5626 packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE); 5627 reps = (packets / IBD_NUM_SWQE); 5628 5629 cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE); 5630 cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE); 5631 cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets); 5632 cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE); 5633 cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL); 5634 if ((packets % IBD_NUM_UNSIGNAL) != 0) { 5635 /* 5636 * This is required to ensure the last packet will trigger 5637 * a CQ handler callback, thus we can spin waiting fot all 5638 * packets to be received. 5639 */ 5640 cmn_err(CE_CONT, 5641 "ibd_perf_tx: #Packets not multiple of Signal Grp size"); 5642 return; 5643 } 5644 num_completions = 0; 5645 5646 swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE, 5647 KM_NOSLEEP); 5648 if (swqes == NULL) { 5649 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5650 return; 5651 } 5652 5653 wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); 5654 if (wcs == NULL) { 5655 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5656 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5657 return; 5658 } 5659 5660 /* 5661 * Get the ud_dest for the destination. 5662 */ 5663 ibd_async_acache(state, &ibd_dest); 5664 mutex_enter(&state->id_ac_mutex); 5665 ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0); 5666 mutex_exit(&state->id_ac_mutex); 5667 if (ace == NULL) { 5668 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5669 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5670 cmn_err(CE_CONT, "ibd_perf_tx: no AH"); 5671 return; 5672 } 5673 5674 /* 5675 * Set up the send buffer. 5676 */ 5677 sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP); 5678 if (sendbuf == NULL) { 5679 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5680 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5681 cmn_err(CE_CONT, "ibd_perf_tx: no send buffer"); 5682 return; 5683 } 5684 5685 /* 5686 * This buffer can be used in the case when we want to 5687 * send data from the same memory area over and over; 5688 * it might help in reducing memory traffic. 5689 */ 5690 mem_attr.mr_vaddr = (uint64_t)sendbuf; 5691 mem_attr.mr_len = IBD_TX_PKTSIZE; 5692 mem_attr.mr_as = NULL; 5693 mem_attr.mr_flags = IBT_MR_NOSLEEP; 5694 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 5695 &mrhdl, &mrdesc) != IBT_SUCCESS) { 5696 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5697 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5698 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5699 cmn_err(CE_CONT, "ibd_perf_tx: registration failed"); 5700 return; 5701 } 5702 5703 /* 5704 * Allocate private send wqe's. 5705 */ 5706 for (i = 0; i < IBD_NUM_SWQE; i++) { 5707 if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) { 5708 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5709 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5710 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5711 cmn_err(CE_CONT, "ibd_alloc_swqe failure"); 5712 return; 5713 } 5714 node->w_ahandle = ace; 5715 #if 0 5716 node->w_smblkbuf[0].im_mr_hdl = mrhdl; 5717 node->w_smblkbuf[0].im_mr_desc = mrdesc; 5718 node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf; 5719 node->w_smblk_sgl[0].ds_key = 5720 node->w_smblkbuf[0].im_mr_desc.md_lkey; 5721 node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE; 5722 node->w_swr.wr_sgl = node->w_smblk_sgl; 5723 #else 5724 node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE; 5725 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5726 #endif 5727 5728 /* 5729 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs 5730 * is marked to invoke the CQ handler. That is the only 5731 * way we come to know when the send queue can accept more 5732 * WRs. 5733 */ 5734 if (((i + 1) % IBD_NUM_UNSIGNAL) != 0) 5735 node->w_swr.wr_flags = IBT_WR_NO_FLAGS; 5736 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5737 node->w_swr.wr_nds = 1; 5738 5739 swqes[i] = node; 5740 } 5741 5742 ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state); 5743 5744 /* 5745 * Post all the requests. We expect this stream of post's will 5746 * not overwhelm the hardware due to periodic completions and 5747 * pollings that happen out of ibd_perf_handler. 5748 * Post a set of requests, till the channel can accept; after 5749 * that, wait for the CQ handler to notify us that there is more 5750 * space. 5751 */ 5752 stime = gethrtime(); 5753 for (; reps > 0; reps--) 5754 for (i = 0; i < IBD_NUM_SWQE; i++) { 5755 node = swqes[i]; 5756 retry: 5757 if ((stat = ibt_post_send(state->id_chnl_hdl, 5758 &node->w_swr, 1, NULL)) != IBT_SUCCESS) { 5759 if (stat == IBT_CHAN_FULL) { 5760 /* 5761 * Spin till the CQ handler runs 5762 * and then try again. 5763 */ 5764 sspin = gethrtime(); 5765 while (!cq_handler_ran); 5766 espin = gethrtime(); 5767 tspin += (espin - sspin); 5768 cq_handler_ran = B_FALSE; 5769 goto retry; 5770 } 5771 cmn_err(CE_CONT, "post failure %d/%d", stat, i); 5772 goto done; 5773 } 5774 } 5775 5776 done: 5777 /* 5778 * We should really be snapshotting when we get the last 5779 * completion. 5780 */ 5781 while (num_completions != (packets / IBD_NUM_UNSIGNAL)); 5782 etime = gethrtime(); 5783 5784 cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d", 5785 num_completions); 5786 cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime)); 5787 cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin); 5788 5789 /* 5790 * Wait a sec for everything to get over. 5791 */ 5792 delay(drv_usectohz(2000000)); 5793 5794 /* 5795 * Reset CQ handler to real one; free resources. 5796 */ 5797 if (ibd_separate_cqs == 0) { 5798 ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state); 5799 } else { 5800 if (ibd_txcomp_poll == 0) 5801 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, 5802 state); 5803 else 5804 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5805 } 5806 5807 for (i = 0; i < IBD_NUM_SWQE; i++) 5808 ibd_free_swqe(state, swqes[i]); 5809 (void) ibt_deregister_mr(state->id_hca_hdl, mrhdl); 5810 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5811 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5812 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5813 } 5814 5815 static void 5816 ibd_perf_rx(ibd_state_t *state) 5817 { 5818 wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); 5819 if (wcs == NULL) { 5820 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5821 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5822 return; 5823 } 5824 5825 /* 5826 * We do not need to allocate private recv wqe's. We will 5827 * just use the regular ones. 5828 */ 5829 5830 num_completions = 0; 5831 ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state); 5832 5833 /* 5834 * Delay for a minute for all the packets to come in from 5835 * transmitter. 5836 */ 5837 cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_SWQE); 5838 delay(drv_usectohz(60000000)); 5839 cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions); 5840 5841 /* 5842 * Reset CQ handler to real one; free resources. 5843 */ 5844 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 5845 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5846 } 5847 5848 static void 5849 ibd_perf(ibd_state_t *state) 5850 { 5851 if (ibd_performance == 0) 5852 return; 5853 5854 if (ibd_receiver == 1) { 5855 ibd_perf_rx(state); 5856 return; 5857 } 5858 5859 if (ibd_sender == 1) { 5860 ibd_perf_tx(state); 5861 return; 5862 } 5863 } 5864 5865 #endif /* RUN_PERFORMANCE */ 5866