/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>

#include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip_if.h>		/* for IP6_DL_SAP */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/pattr.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

/*
 * Modes of hardware/driver/software checksum, useful for debugging
 * and performance studies.
 *
 * none: neither the h/w (Tavor) nor the driver computes the checksum;
 *	IP software must.
 * partial: the driver computes the data checksum; IP must provide the
 *	pseudo-header checksum.
 * perf_partial: the driver uses the IP provided pseudo checksum as the
 *	data checksum (thus, real checksumming is not done).
 */
typedef enum {
	IBD_CSUM_NONE,
	IBD_CSUM_PARTIAL,
	IBD_CSUM_PERF_PARTIAL
} ibd_csum_type_t;
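
/*
 * Illustrative sketch (not driver code): for a single contiguous buffer,
 * given the start/stuff offsets and the pseudo-header checksum that IP
 * supplies via hcksum_retrieve(), the "partial" mode amounts to roughly
 * the following (buf, start, len and stuffp are illustrative names; the
 * real code is in IBD_CKSUM_SEND further down):
 *
 *	sum = *stuffp;			-- pseudo-header cksum from IP
 *	*stuffp = 0;
 *	sum = IP_BCSUM_PARTIAL(buf + start, len, sum);
 *	sum = ~sum;
 *	*stuffp = (uint16_t)(sum ? sum : ~sum);
 *
 * whereas "perf_partial" skips the data summation and just inverts and
 * writes back the IP-provided pseudo checksum, so no real checksumming
 * is done.
 */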
typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;

/*
 * Per interface tunable parameters.
 */
static uint_t ibd_rx_threshold = 16;
static uint_t ibd_tx_current_copy_threshold = 0x10000000;
static uint_t ibd_num_rwqe = 4095;	/* 1 less than max Tavor CQsize */
static uint_t ibd_num_swqe = 4095;	/* 1 less than max Tavor CQsize */
static uint_t ibd_num_ah = 16;
static uint_t ibd_hash_size = 16;
static uint_t ibd_srv_fifos = 0xffff;
static uint_t ibd_fifo_depth = 0;
static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;

/*
 * The driver can use separate CQs for send and receive queues.
 * While using separate CQs, it is possible to put the send CQ
 * in polling mode, i.e. not to enable notifications on that CQ.
 * If both CQs are interrupt driven, currently it is not possible
 * for their handlers to be invoked concurrently (since Tavor ties
 * both interrupts to the same PCI intr line); but the handlers
 * are not coded with a single interrupt cpu assumption (e.g.
 * id_num_intrs is incremented atomically).
 *
 * The driver private struct uses id_scq_hdl to track the separate
 * CQ being used for send; the id_rcq_hdl tracks the receive CQ
 * if using separate CQs, or it tracks the single CQ when using
 * a combined CQ. The id_wcs completion array is used in the combined
 * CQ case, and for fetching Rx completions in the separate CQs case;
 * the id_txwcs is used to fetch Tx completions in the separate CQs
 * case.
 */
static uint_t ibd_separate_cqs = 1;
static uint_t ibd_txcomp_poll = 0;

/*
 * Initial number of IBA resources allocated.
 */
#define	IBD_NUM_RWQE	ibd_num_rwqe
#define	IBD_NUM_SWQE	ibd_num_swqe
#define	IBD_NUM_AH	ibd_num_ah

/* when <= threshold, it's faster to copy to a premapped buffer */
#define	IBD_TX_COPY_THRESHOLD	ibd_tx_current_copy_threshold

/*
 * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
 * allocate a new WQE to put on the rxlist. This value must be <=
 * IBD_NUM_RWQE/id_num_rwqe.
 */
#define	IBD_RX_THRESHOLD	ibd_rx_threshold

/*
 * Hash table size for the active AH list.
 */
#define	IBD_HASH_SIZE	ibd_hash_size

/*
 * Size of completion array to be filled by a single poll call.
 */
#define	IBD_WC_SIZE	16

/*
 * We poll every (IBD_TXPOLL_MASK + 1) sends for completions. This
 * is based on our above completion array size.
 */
#define	IBD_TXPOLL_MASK	0xf

/*
 * Number of payload areas the MDT code can support. Choose the same value
 * that we know is supported by TCP/MDT.
156 */ 157 #define IBD_MDTMAX_SEGS 16 158 159 /* 160 * PAD routine called during send/recv context 161 */ 162 #define IBD_SEND 0 163 #define IBD_RECV 1 164 165 /* Driver State Pointer */ 166 void *ibd_list; 167 168 /* Required system entry points */ 169 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 170 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 171 172 /* Required driver entry points for GLD */ 173 static int ibd_reset(gld_mac_info_t *); 174 static int ibd_start(gld_mac_info_t *); 175 static int ibd_stop(gld_mac_info_t *); 176 static int ibd_set_mac_addr(gld_mac_info_t *, unsigned char *); 177 static int ibd_set_multicast(gld_mac_info_t *, unsigned char *, int); 178 static int ibd_set_promiscuous(gld_mac_info_t *, int); 179 static int ibd_get_stats(gld_mac_info_t *, struct gld_stats *); 180 static int ibd_send(gld_mac_info_t *, mblk_t *); 181 static int ibd_mdt_pre(gld_mac_info_t *, mblk_t *, void **); 182 static void ibd_mdt_txone(gld_mac_info_t *, void *, pdescinfo_t *); 183 static void ibd_mdt_post(gld_mac_info_t *, mblk_t *, void *); 184 static uint_t ibd_intr(gld_mac_info_t *); 185 186 /* Private driver entry points for GLD */ 187 static int ibd_state_init(ibd_state_t *, dev_info_t *); 188 static void ibd_state_fini(ibd_state_t *); 189 static int ibd_drv_init(ibd_state_t *); 190 static void ibd_drv_fini(ibd_state_t *); 191 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 192 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 193 static void ibd_snet_notices_handler(void *, ib_gid_t, 194 ibt_subnet_event_code_t, ibt_subnet_event_t *); 195 static int ibd_init_txlist(ibd_state_t *); 196 static void ibd_fini_txlist(ibd_state_t *); 197 static int ibd_init_rxlist(ibd_state_t *); 198 static void ibd_fini_rxlist(ibd_state_t *); 199 static void ibd_freemsg_cb(char *); 200 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *, boolean_t); 201 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 202 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **); 203 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 204 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 205 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 206 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 207 ibt_async_event_t *); 208 static int ibd_acache_init(ibd_state_t *); 209 static void ibd_acache_fini(ibd_state_t *); 210 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 211 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 212 static void ibd_async_unsetprom(ibd_state_t *, boolean_t); 213 static void ibd_async_setprom(ibd_state_t *, boolean_t); 214 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 215 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 216 static void ibd_async_txsched(ibd_state_t *); 217 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 218 static void ibd_async_work(ibd_state_t *); 219 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 220 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 221 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t); 222 static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *, 223 ipoib_mac_t *); 224 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 225 static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *); 226 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 227 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 228 static uint64_t 
ibd_get_portspeed(ibd_state_t *); 229 230 #ifdef RUN_PERFORMANCE 231 static void ibd_perf(ibd_state_t *); 232 #endif 233 234 /* Streams Module Info */ 235 static struct module_info ibd_minfo = { 236 IBD_IDNUM, /* module ID Number */ 237 "ibd", /* module name */ 238 0, /* min packet size */ 239 INFPSZ, /* maximum packet size */ 240 IBD_HIWAT, /* high water mark */ 241 IBD_LOWAT /* low water mark */ 242 }; 243 244 /* Streams Read Queue */ 245 static struct qinit ibd_rdinit = { 246 NULL, /* put */ 247 gld_rsrv, /* service */ 248 gld_open, /* open */ 249 gld_close, /* close */ 250 NULL, /* unused */ 251 &ibd_minfo, /* parameters */ 252 NULL /* statistics */ 253 }; 254 255 /* Streams Write Queue */ 256 static struct qinit ibd_wrinit = { 257 gld_wput, /* put */ 258 gld_wsrv, /* service */ 259 NULL, /* open */ 260 NULL, /* close */ 261 NULL, /* unused */ 262 &ibd_minfo, /* parameters */ 263 NULL /* statistics */ 264 }; 265 266 /* Stream Operations */ 267 static struct streamtab ibd_streamtab = { 268 &ibd_rdinit, /* read queue */ 269 &ibd_wrinit, /* write queue */ 270 NULL, /* lower read queue (MUX) */ 271 NULL /* lower write queue (MUX) */ 272 }; 273 274 /* Character/Block Operations */ 275 static struct cb_ops ibd_cb_ops = { 276 nulldev, /* open */ 277 nulldev, /* close */ 278 nodev, /* strategy (block) */ 279 nodev, /* print (block) */ 280 nodev, /* dump (block) */ 281 nodev, /* read */ 282 nodev, /* write */ 283 nodev, /* ioctl */ 284 nodev, /* devmap */ 285 nodev, /* mmap */ 286 nodev, /* segmap */ 287 nochpoll, /* chpoll */ 288 ddi_prop_op, /* prop_op */ 289 &ibd_streamtab, /* streams */ 290 D_MP | D_64BIT, /* flags */ 291 CB_REV /* rev */ 292 }; 293 294 /* Driver Operations */ 295 static struct dev_ops ibd_dev_ops = { 296 DEVO_REV, /* struct rev */ 297 0, /* refcnt */ 298 gld_getinfo, /* getinfo */ 299 nulldev, /* identify */ 300 nulldev, /* probe */ 301 ibd_attach, /* attach */ 302 ibd_detach, /* detach */ 303 nodev, /* reset */ 304 &ibd_cb_ops, /* cb_ops */ 305 NULL, /* bus_ops */ 306 nodev /* power */ 307 }; 308 309 /* Module Driver Info */ 310 static struct modldrv ibd_modldrv = { 311 &mod_driverops, 312 "InfiniBand DLPI Driver %I%", 313 &ibd_dev_ops 314 }; 315 316 /* Module Linkage */ 317 static struct modlinkage ibd_modlinkage = { 318 MODREV_1, 319 &ibd_modldrv, 320 NULL 321 }; 322 323 /* 324 * Module Info passed to IBTL during IBT_ATTACH. 325 * NOTE: This data must be static (i.e. IBTL just keeps a pointer to this 326 * data). 327 */ 328 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 329 IBTI_V2, 330 IBT_NETWORK, 331 ibd_async_handler, 332 NULL, 333 "IPIB" 334 }; 335 336 /* 337 * Async operation types. 338 */ 339 #define ASYNC_GETAH 1 340 #define ASYNC_JOIN 2 341 #define ASYNC_LEAVE 3 342 #define ASYNC_PROMON 4 343 #define ASYNC_PROMOFF 5 344 #define ASYNC_REAP 6 345 #define ASYNC_POKE 7 346 #define ASYNC_TRAP 8 347 #define ASYNC_SCHED 9 348 #define ASYNC_LINK 10 349 #define ASYNC_EXIT 11 350 351 /* 352 * Async operation states 353 */ 354 #define NOTSTARTED 0 355 #define ONGOING 1 356 #define COMPLETED 2 357 #define ERRORED 3 358 #define ROUTERED 4 359 360 #define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF 361 362 #ifdef DEBUG 363 364 static int rxpack = 1, txpack = 1; 365 int debuglevel = 100; 366 static void 367 debug_print(int l, char *fmt, ...) 
368 { 369 va_list ap; 370 371 if (l < debuglevel) 372 return; 373 va_start(ap, fmt); 374 vcmn_err(CE_CONT, fmt, ap); 375 va_end(ap); 376 } 377 #define INCRXPACK (rxpack++) 378 #define INCTXPACK (txpack++) 379 #define DPRINT debug_print 380 381 #else /* DEBUG */ 382 383 #define INCRXPACK 0 384 #define INCTXPACK 0 385 #define DPRINT 386 387 #endif /* DEBUG */ 388 389 /* 390 * Common routine to print warning messages; adds in hca guid, port number 391 * and pkey to be able to identify the IBA interface. 392 */ 393 static void 394 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 395 { 396 ib_guid_t hca_guid; 397 char ibd_print_buf[256]; 398 int len; 399 va_list ap; 400 401 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 402 0, "hca-guid", 0); 403 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 404 "%s%d: HCA GUID %016llx port %d PKEY %02x ", ibd_minfo.mi_idname, 405 state->id_macinfo->gldm_ppa, (u_longlong_t)hca_guid, 406 state->id_port, state->id_pkey); 407 va_start(ap, fmt); 408 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 409 fmt, ap); 410 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 411 va_end(ap); 412 } 413 414 /* warlock directives */ 415 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 416 ibd_state_t::id_ah_active)) 417 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free)) 418 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 419 ibd_state_t::id_req_list)) 420 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 421 ibd_state_t::id_acache_req_cv)) 422 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 423 ibd_state_t::id_multi_req)) 424 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 425 ibd_state_t::id_multi_addr)) 426 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 427 ibd_state_t::id_multi_op)) 428 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 429 ibd_state_t::id_multi_queued)) 430 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 431 ibd_state_t::id_mc_full)) 432 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 433 ibd_state_t::id_mc_non)) 434 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 435 ibd_state_t::id_link_state)) 436 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 437 ibd_state_s::id_tx_list)) 438 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 439 ibd_state_s::id_rx_list)) 440 441 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_multi_op)) 442 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error)) 443 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op)) 444 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs)) 445 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op)) 446 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short)) 447 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list)) 448 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list)) 449 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op)) 450 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid)) 451 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr)) 452 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce)) 453 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref)) 454 455 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s)) 456 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s)) 457 
_NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s)) 458 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac)) 459 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh)) 460 461 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s)) 462 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req)) 463 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap)) 464 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate)) 465 466 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr)) 467 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr)) 468 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", gld_stats)) 469 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id)) 470 471 #ifdef DEBUG 472 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack)) 473 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack)) 474 #endif 475 476 int 477 _init() 478 { 479 int status; 480 481 /* 482 * Sanity check some parameter settings. Tx completion polling 483 * only makes sense with separate CQs for Tx and Rx. 484 */ 485 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 486 cmn_err(CE_NOTE, "!%s: %s", ibd_minfo.mi_idname, 487 "Setting ibd_txcomp_poll = 0 for combined CQ"); 488 ibd_txcomp_poll = 0; 489 } 490 491 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 492 if (status != 0) { 493 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 494 return (status); 495 } 496 497 status = mod_install(&ibd_modlinkage); 498 if (status != 0) { 499 DPRINT(10, "_init:failed in mod_install()"); 500 ddi_soft_state_fini(&ibd_list); 501 return (status); 502 } 503 504 return (0); 505 } 506 507 int 508 _info(struct modinfo *modinfop) 509 { 510 return (mod_info(&ibd_modlinkage, modinfop)); 511 } 512 513 int 514 _fini() 515 { 516 int status; 517 518 status = mod_remove(&ibd_modlinkage); 519 if (status != 0) 520 return (status); 521 522 ddi_soft_state_fini(&ibd_list); 523 return (0); 524 } 525 526 /* 527 * Convert the GID part of the mac address from network byte order 528 * to host order. 529 */ 530 static void 531 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 532 { 533 ib_sn_prefix_t nbopref; 534 ib_guid_t nboguid; 535 536 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 537 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 538 dgid->gid_prefix = b2h64(nbopref); 539 dgid->gid_guid = b2h64(nboguid); 540 } 541 542 /* 543 * Create the IPoIB address in network byte order from host order inputs. 544 */ 545 static void 546 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 547 ib_guid_t guid) 548 { 549 ib_sn_prefix_t nbopref; 550 ib_guid_t nboguid; 551 552 mac->ipoib_qpn = htonl(qpn); 553 nbopref = h2b64(prefix); 554 nboguid = h2b64(guid); 555 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 556 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 557 } 558 559 /* 560 * Send to the appropriate all-routers group when the IBA multicast group 561 * does not exist, based on whether the target group is v4 or v6. 562 */ 563 static boolean_t 564 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 565 ipoib_mac_t *rmac) 566 { 567 boolean_t retval = B_TRUE; 568 uint32_t adjscope = state->id_scope << 16; 569 uint32_t topword; 570 571 /* 572 * Copy the first 4 bytes in without assuming any alignment of 573 * input mac address; this will have IPoIB signature, flags and 574 * scope bits. 
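 * For illustration, with the usual link-local scope these 4 bytes read
 * 0xFF12401B for an IPv4 target group and 0xFF12601B for an IPv6 target
 * group, so the all-routers address constructed below works out to the
 * GID FF12:401B:<pkey>:0:0:0:0:0002 in the IPv4 case
 * (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP = 2).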
575 */ 576 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 577 topword = ntohl(topword); 578 579 /* 580 * Generate proper address for IPv4/v6, adding in the Pkey properly. 581 */ 582 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 583 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 584 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 585 ((uint32_t)(state->id_pkey << 16))), 586 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 587 else 588 /* 589 * Does not have proper bits in the mgid address. 590 */ 591 retval = B_FALSE; 592 593 return (retval); 594 } 595 596 /* 597 * Implementation of various (software) flavors of send and receive side 598 * checksumming. 599 */ 600 #define IBD_CKSUM_SEND(mp) { \ 601 uint32_t start, stuff, end, value, flags; \ 602 uint32_t cksum, sum; \ 603 uchar_t *dp, *buf; \ 604 uint16_t *up; \ 605 \ 606 if (ibd_csum_send == IBD_CSUM_NONE) \ 607 goto punt_send; \ 608 \ 609 /* \ 610 * Query IP whether Tx cksum needs to be done. \ 611 */ \ 612 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, \ 613 &value, &flags); \ 614 \ 615 if (flags == HCK_PARTIALCKSUM) { \ 616 dp = ((uchar_t *)mp->b_rptr + IPOIB_HDRSIZE); \ 617 up = (uint16_t *)(dp + stuff); \ 618 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 619 end = ((uchar_t *)mp->b_wptr - dp - start); \ 620 cksum = *up; \ 621 *up = 0; \ 622 /* \ 623 * Does NOT handle chained mblks/more than one \ 624 * SGL. Applicable only for a single SGL \ 625 * entry/mblk, where the stuff offset is \ 626 * within the range of buf. \ 627 */ \ 628 buf = (dp + start); \ 629 sum = IP_BCSUM_PARTIAL(buf, end, cksum); \ 630 } else { \ 631 sum = *up; \ 632 } \ 633 DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n", \ 634 start, stuff, end, sum, cksum); \ 635 sum = ~(sum); \ 636 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 637 } \ 638 punt_send: \ 639 ; \ 640 } 641 642 #define IBD_CKSUM_RECV(mp) { \ 643 uchar_t *dp, *buf; \ 644 uint32_t start, end, value, stuff, flags; \ 645 uint16_t *up, frag; \ 646 ipha_t *iphp; \ 647 ipoib_hdr_t *ipibh; \ 648 \ 649 if (ibd_csum_recv == IBD_CSUM_NONE) \ 650 goto punt_recv; \ 651 \ 652 ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\ 653 if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP) \ 654 goto punt_recv; \ 655 \ 656 dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE); \ 657 iphp = (ipha_t *)dp; \ 658 frag = ntohs(iphp->ipha_fragment_offset_and_flags); \ 659 if ((frag) & (~IPH_DF)) \ 660 goto punt_recv; \ 661 start = IPH_HDR_LENGTH(iphp); \ 662 if (iphp->ipha_protocol == IPPROTO_TCP) \ 663 stuff = start + 16; \ 664 else if (iphp->ipha_protocol == IPPROTO_UDP) \ 665 stuff = start + 6; \ 666 else \ 667 goto punt_recv; \ 668 \ 669 flags = HCK_PARTIALCKSUM; \ 670 end = ntohs(iphp->ipha_length); \ 671 up = (uint16_t *)(dp + stuff); \ 672 \ 673 if (ibd_csum_recv == IBD_CSUM_PARTIAL) { \ 674 buf = (dp + start); \ 675 value = IP_BCSUM_PARTIAL(buf, end - start, 0); \ 676 } else { \ 677 value = (*up); \ 678 } \ 679 if (hcksum_assoc(mp, NULL, NULL, start, stuff, end, \ 680 value, flags, 0) != 0) \ 681 DPRINT(10, "cksum_recv: value: %x\n", value); \ 682 punt_recv: \ 683 ; \ 684 } 685 686 #define IBD_CKSUM_MDT(mp, dlmdp, np, stp, stfp, ep, vp, fp) { \ 687 /* \ 688 * Query IP whether Tx cksum needs to be done. 
\ 689 */ \ 690 if (ibd_csum_send != IBD_CSUM_NONE) \ 691 hcksum_retrieve(mp, dlmdp, np, stp, stfp, ep, vp, fp); \ 692 } 693 694 #define IBD_CKSUM_MDT_PACKET(pinfo, st, stf, fl) { \ 695 if ((ibd_csum_send != IBD_CSUM_NONE) && \ 696 (fl == HCK_PARTIALCKSUM)) { \ 697 extern uint_t bcksum(uchar_t *, int, uint32_t); \ 698 uint16_t *up; \ 699 uint32_t sum; \ 700 uchar_t *hp = (pinfo)->hdr_rptr + IPOIB_HDRSIZE; \ 701 int k; \ 702 \ 703 up = (uint16_t *)(hp + stf); \ 704 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 705 sum = *up; \ 706 *up = 0; \ 707 sum = IP_BCSUM_PARTIAL(hp + st, \ 708 PDESC_HDRL(pinfo) - st - IPOIB_HDRSIZE, \ 709 sum); \ 710 for (k = 0; k < pinfo->pld_cnt; k++) \ 711 sum = IP_BCSUM_PARTIAL(pinfo->pld_ary[k].\ 712 pld_rptr, PDESC_PLDL(pinfo, k), \ 713 sum); \ 714 } else { \ 715 sum = *up; \ 716 } \ 717 sum = ~(sum); \ 718 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 719 } \ 720 } 721 722 /* 723 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 724 * front of optional src/tgt link layer address. Right now Solaris inserts 725 * padding by default at the end. The routine which is doing is nce_xmit() 726 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when 727 * the packet comes down from IP layer to the IBD driver, it is in the 728 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 729 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 730 * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 731 * 732 * The send routine at IBD driver changes this packet as follows: 733 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 734 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 735 * aligned. 736 * 737 * At the receiving side again ibd_process_rx takes the above packet and 738 * removes the two bytes of front padding and inserts it at the end. This 739 * is since the IP layer does not understand padding at the front. 740 */ 741 #define IBD_PAD_NSNA(ip6h, len, type) { \ 742 uchar_t *nd_lla_ptr; \ 743 icmp6_t *icmp6; \ 744 nd_opt_hdr_t *opt; \ 745 int i; \ 746 \ 747 icmp6 = (icmp6_t *)&ip6h[1]; \ 748 len -= sizeof (nd_neighbor_advert_t); \ 749 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 750 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 751 (len != 0)) { \ 752 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 753 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 754 ASSERT(opt != NULL); \ 755 nd_lla_ptr = (uchar_t *)&opt[1]; \ 756 if (type == 0) { \ 757 for (i = IPOIB_ADDRL; i > 0; i--) \ 758 *(nd_lla_ptr + i + 1) = \ 759 *(nd_lla_ptr + i - 1); \ 760 } else { \ 761 for (i = 0; i < IPOIB_ADDRL; i++) \ 762 *(nd_lla_ptr + i) = \ 763 *(nd_lla_ptr + i + 2); \ 764 } \ 765 *(nd_lla_ptr + i) = 0; \ 766 *(nd_lla_ptr + i + 1) = 0; \ 767 } \ 768 } 769 770 /* 771 * The service fifo code is copied verbatim from Cassini. This can be 772 * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu. 
773 */ 774 775 typedef caddr_t fifo_obj_t, *p_fifo_obj_t; 776 777 typedef struct _srv_fifo_t { 778 kmutex_t fifo_lock; 779 kcondvar_t fifo_cv; 780 size_t size; 781 uint_t max_index; 782 uint_t rd_index; 783 uint_t wr_index; 784 uint_t objs_pending; 785 p_fifo_obj_t fifo_objs; 786 kthread_t *fifo_thread; 787 void (*drain_func)(caddr_t drain_func_arg); 788 caddr_t drain_func_arg; 789 boolean_t running; 790 callb_cpr_t cprinfo; 791 } srv_fifo_t, *p_srv_fifo_t; 792 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv)) 793 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo)) 794 795 static int 796 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size, 797 void (*drain_func)(), caddr_t drain_func_arg) 798 { 799 int status; 800 p_srv_fifo_t srv_fifo; 801 802 status = DDI_SUCCESS; 803 srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP); 804 srv_fifo->size = size; 805 srv_fifo->max_index = size - 1; 806 srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc( 807 size * sizeof (fifo_obj_t), KM_SLEEP); 808 mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL); 809 cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL); 810 srv_fifo->drain_func = drain_func; 811 srv_fifo->drain_func_arg = drain_func_arg; 812 srv_fifo->running = DDI_SUCCESS; 813 srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func, 814 (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60); 815 if (srv_fifo->fifo_thread == NULL) { 816 cv_destroy(&srv_fifo->fifo_cv); 817 mutex_destroy(&srv_fifo->fifo_lock); 818 kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t)); 819 kmem_free(srv_fifo, sizeof (srv_fifo_t)); 820 srv_fifo = NULL; 821 status = DDI_FAILURE; 822 } else 823 *handle = srv_fifo; 824 return (status); 825 } 826 827 static void 828 _ddi_srv_fifo_destroy(p_srv_fifo_t handle) 829 { 830 kt_did_t tid = handle->fifo_thread->t_did; 831 832 mutex_enter(&handle->fifo_lock); 833 handle->running = DDI_FAILURE; 834 cv_signal(&handle->fifo_cv); 835 while (handle->running == DDI_FAILURE) 836 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 837 mutex_exit(&handle->fifo_lock); 838 if (handle->objs_pending != 0) 839 cmn_err(CE_NOTE, "!Thread Exit with work undone."); 840 cv_destroy(&handle->fifo_cv); 841 mutex_destroy(&handle->fifo_lock); 842 kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t)); 843 kmem_free(handle, sizeof (srv_fifo_t)); 844 thread_join(tid); 845 } 846 847 static caddr_t 848 _ddi_srv_fifo_begin(p_srv_fifo_t handle) 849 { 850 #ifndef __lock_lint 851 CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock, 852 callb_generic_cpr, "srv_fifo"); 853 #endif /* ! _lock_lint */ 854 return (handle->drain_func_arg); 855 } 856 857 static void 858 _ddi_srv_fifo_end(p_srv_fifo_t handle) 859 { 860 callb_cpr_t cprinfo; 861 862 mutex_enter(&handle->fifo_lock); 863 cprinfo = handle->cprinfo; 864 handle->running = DDI_SUCCESS; 865 cv_signal(&handle->fifo_cv); 866 #ifndef __lock_lint 867 CALLB_CPR_EXIT(&cprinfo); 868 #endif /* ! 
_lock_lint */ 869 thread_exit(); 870 _NOTE(NOT_REACHED) 871 } 872 873 static int 874 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal) 875 { 876 int status; 877 878 mutex_enter(&handle->fifo_lock); 879 status = handle->running; 880 if (status == DDI_SUCCESS) { 881 if (ptr) { 882 if (handle->objs_pending < handle->size) { 883 if (handle->wr_index == handle->max_index) 884 handle->wr_index = 0; 885 else 886 handle->wr_index++; 887 handle->fifo_objs[handle->wr_index] = ptr; 888 handle->objs_pending++; 889 } else 890 status = DDI_FAILURE; 891 if (signal) 892 cv_signal(&handle->fifo_cv); 893 } else { 894 if (signal && (handle->objs_pending > 0)) 895 cv_signal(&handle->fifo_cv); 896 } 897 } 898 mutex_exit(&handle->fifo_lock); 899 return (status); 900 } 901 902 static int 903 _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr) 904 { 905 int status; 906 907 mutex_enter(&handle->fifo_lock); 908 status = handle->running; 909 if (status == DDI_SUCCESS) { 910 if (handle->objs_pending == 0) { 911 #ifndef __lock_lint 912 CALLB_CPR_SAFE_BEGIN(&handle->cprinfo); 913 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 914 CALLB_CPR_SAFE_END(&handle->cprinfo, 915 &handle->fifo_lock); 916 #endif /* !_lock_lint */ 917 *ptr = NULL; 918 } 919 if (handle->objs_pending > 0) { 920 if (handle->rd_index == handle->max_index) 921 handle->rd_index = 0; 922 else 923 handle->rd_index++; 924 *ptr = handle->fifo_objs[handle->rd_index]; 925 handle->objs_pending--; 926 } 927 status = handle->running; 928 } else { 929 if (handle->objs_pending) { 930 if (handle->rd_index == handle->max_index) 931 handle->rd_index = 0; 932 else 933 handle->rd_index++; 934 *ptr = handle->fifo_objs[handle->rd_index]; 935 handle->objs_pending--; 936 status = DDI_SUCCESS; 937 } else 938 status = DDI_FAILURE; 939 } 940 mutex_exit(&handle->fifo_lock); 941 return (status); 942 } 943 944 /* 945 * [un]map_rx_srv_fifos has been modified from its CE version. 946 */ 947 static void 948 drain_fifo(p_srv_fifo_t handle) 949 { 950 ibd_state_t *state; 951 mblk_t *mp; 952 953 state = (ibd_state_t *)_ddi_srv_fifo_begin(handle); 954 while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) { 955 /* 956 * Hand off to GLD. 957 */ 958 IBD_CKSUM_RECV(mp); 959 gld_recv(state->id_macinfo, mp); 960 } 961 _ddi_srv_fifo_end(handle); 962 } 963 964 static p_srv_fifo_t * 965 map_rx_srv_fifos(int *nfifos, void *private) 966 { 967 p_srv_fifo_t *srv_fifos; 968 int i, inst_taskqs, depth; 969 970 /* 971 * Default behavior on sparc cpus (with lower cpu frequency) is 972 * to use service fifo if ncpus > 1 and not to use service fifo 973 * on single cpu systems; on intel/amd cpus (with higher cpu 974 * frequency), the default is never to use service fifos. This 975 * can be changed by tweaking ibd_srv_fifos (set to 0 or 1 976 * by administrator). On single cpu systems, network 977 * processing is given lower priority if using service 978 * threads, thus possibly making the system more usable 979 * at high network loads (maybe by throttling network 980 * throughput). 981 */ 982 if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) || 983 #if !defined(__sparc) 984 (ibd_srv_fifos == 0xffff) || 985 #endif 986 (ibd_srv_fifos == 0)) { 987 *nfifos = 0; 988 return ((p_srv_fifo_t *)1); 989 } 990 991 *nfifos = inst_taskqs; 992 srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t), 993 KM_SLEEP); 994 995 /* 996 * If the administrator has specified a fifo depth, use 997 * that, else just decide what should be the depth. 
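 * With the default tunables, for example, 4095 receive WQEs
 * (IBD_NUM_RWQE) on a 4 cpu system give each of the 4 service fifos a
 * depth of 4095/4 + 16 = 1039 entries, i.e. an even share of the
 * receive ring plus a small cushion.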
998 */ 999 if (ibd_fifo_depth == 0) 1000 depth = (IBD_NUM_RWQE / inst_taskqs) + 16; 1001 else 1002 depth = ibd_fifo_depth; 1003 1004 for (i = 0; i < inst_taskqs; i++) 1005 if (_ddi_srv_fifo_create(&srv_fifos[i], 1006 depth, drain_fifo, 1007 (caddr_t)private) != DDI_SUCCESS) 1008 break; 1009 1010 if (i < inst_taskqs) 1011 goto map_rx_srv_fifos_fail1; 1012 1013 goto map_rx_srv_fifos_exit; 1014 1015 map_rx_srv_fifos_fail1: 1016 i--; 1017 for (; i >= 0; i--) { 1018 _ddi_srv_fifo_destroy(srv_fifos[i]); 1019 } 1020 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 1021 srv_fifos = NULL; 1022 1023 map_rx_srv_fifos_exit: 1024 return (srv_fifos); 1025 } 1026 1027 static void 1028 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos) 1029 { 1030 int i; 1031 1032 /* 1033 * If this interface was not using service fifos, quickly return. 1034 */ 1035 if (inst_taskqs == 0) 1036 return; 1037 1038 for (i = 0; i < inst_taskqs; i++) { 1039 _ddi_srv_fifo_destroy(srv_fifos[i]); 1040 } 1041 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 1042 } 1043 1044 /* 1045 * Choose between sending up the packet directly and handing off 1046 * to a service thread. 1047 */ 1048 static void 1049 ibd_send_up(ibd_state_t *state, mblk_t *mp) 1050 { 1051 p_srv_fifo_t *srvfifo; 1052 ipoib_hdr_t *lhdr; 1053 struct ip *ip_hdr; 1054 struct udphdr *tran_hdr; 1055 uchar_t prot; 1056 int tnum = -1, nfifos = state->id_nfifos; 1057 1058 /* 1059 * Quick path if the interface is not using service fifos. 1060 */ 1061 if (nfifos == 0) { 1062 hand_off: 1063 IBD_CKSUM_RECV(mp); 1064 gld_recv(state->id_macinfo, mp); 1065 return; 1066 } 1067 1068 /* 1069 * Is the packet big enough to look at the IPoIB header 1070 * and basic IP header to determine whether it is an 1071 * IPv4 packet? 1072 */ 1073 if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE + 1074 sizeof (struct ip))) { 1075 1076 lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE); 1077 1078 /* 1079 * Is the packet an IP(v4) packet? 1080 */ 1081 if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) { 1082 1083 ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE + 1084 IPOIB_HDRSIZE); 1085 prot = ip_hdr->ip_p; 1086 1087 /* 1088 * TCP or UDP packet? We use the UDP header, since 1089 * the first few words of both headers are laid out 1090 * similarly (src/dest ports). 1091 */ 1092 if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) { 1093 1094 tran_hdr = (struct udphdr *)( 1095 (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2)); 1096 1097 /* 1098 * Are we within limits of this packet? If 1099 * so, use the destination port to hash to 1100 * a service thread. 1101 */ 1102 if (mp->b_wptr >= ((uchar_t *)tran_hdr + 1103 sizeof (*tran_hdr))) 1104 tnum = (ntohs(tran_hdr->uh_dport) + 1105 ntohs(tran_hdr->uh_sport)) % 1106 nfifos; 1107 } 1108 } 1109 } 1110 1111 /* 1112 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the 1113 * packet up in interrupt context, reducing latency. 1114 */ 1115 if (tnum == -1) { 1116 goto hand_off; 1117 } 1118 1119 srvfifo = (p_srv_fifo_t *)state->id_fifos; 1120 if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp, 1121 B_TRUE) != DDI_SUCCESS) 1122 freemsg(mp); 1123 } 1124 1125 /* 1126 * Address handle entries maintained by the driver are kept in the 1127 * free and active lists. Each entry starts out in the free list; 1128 * it migrates to the active list when primed using ibt_get_paths() 1129 * and ibt_modify_ud_dest() for transmission to a specific destination. 
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (i.e. the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are atomically
 * done (increments done by Tx path, decrements by the Tx callback handler).
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)

/*
 * Membership states for different mcg's are tracked by two lists:
 * the "non" list is used for promiscuous mode, when all mcg traffic
 * needs to be inspected. This type of membership is never used for
 * transmission, so there can not be an AH in the active list
 * corresponding to a member in this list. This list does not need
 * any protection, since all operations are performed by the async
 * thread.
 *
 * "Full" and "SendOnly" membership is tracked using a single list,
 * the "full" list. This is because this single list can then be
 * searched during transmit to a multicast group (if an AH for the
 * mcg is not found in the active list), since at least one type
 * of membership must be present before initiating the transmit.
 * This list is also emptied during driver detach, since sendonly
 * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Inserts/deletes to
 * this list are done only by the async thread, but it is also
 * searched in program context (see multicast disable case), thus
 * the id_mc_mutex protects the list. The driver detach path also
 * deconstructs the "full" list, but it ensures that the async
 * thread will not be accessing the list (by blocking out mcg
 * trap handling and making sure no more Tx reaping will happen).
 *
 * Currently, an IBA attach is done in the SendOnly case too,
 * although this is not required.
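 *
 * As a rough sketch (not literal driver code) of how these lists are
 * consulted when transmitting to a multicast destination:
 *
 *	if (IBD_MCACHE_FIND_FULL(state, mgid) == NULL)
 *		(void) ibd_join_group(state, mgid,
 *		    IB_MC_JSTATE_SEND_ONLY_NON);
 *
 * i.e. the "full" list is checked first, and a SendOnlyNonMember join
 * is attempted only if no membership exists; the "non" list is
 * consulted only by the promiscuous mode code.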
1195 */ 1196 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1197 list_insert_head(&state->id_mc_full, mce) 1198 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1199 list_insert_head(&state->id_mc_non, mce) 1200 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1201 ibd_mcache_find(mgid, &state->id_mc_full) 1202 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1203 ibd_mcache_find(mgid, &state->id_mc_non) 1204 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1205 list_remove(&state->id_mc_full, mce) 1206 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1207 list_remove(&state->id_mc_non, mce) 1208 1209 /* 1210 * AH and MCE active list manipulation: 1211 * 1212 * Multicast disable requests and MCG delete traps are two cases 1213 * where the active AH entry for the mcg (if any unreferenced one exists) 1214 * will be moved to the free list (to force the next Tx to the mcg to 1215 * join the MCG in SendOnly mode). Port up handling will also move AHs 1216 * from active to free list. 1217 * 1218 * In the case when some transmits are still pending on an entry 1219 * for an mcg, but a multicast disable has already been issued on the 1220 * mcg, there are some options to consider to preserve the join state 1221 * to ensure the emitted packet is properly routed on the IBA fabric. 1222 * For the AH, we can 1223 * 1. take out of active list at multicast disable time. 1224 * 2. take out of active list only when last pending Tx completes. 1225 * For the MCE, we can 1226 * 3. take out of active list at multicast disable time. 1227 * 4. take out of active list only when last pending Tx completes. 1228 * 5. move from active list to stale list at multicast disable time. 1229 * We choose to use 2,4. We use option 4 so that if a multicast enable 1230 * is tried before the pending Tx completes, the enable code finds the 1231 * mce in the active list and just has to make sure it will not be reaped 1232 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1233 * a stale list (#5) that would be checked in the enable code would need 1234 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1235 * after the multicast disable would try to put an AH in the active list, 1236 * and associate the mce it finds in the active list to this new AH, 1237 * whereas the mce is already associated with the previous AH (taken off 1238 * the active list), and will be removed once the pending Tx's complete 1239 * (unless a reference count on mce's is implemented). One implication of 1240 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1241 * grab new references on the AH, further delaying the leave. 1242 * 1243 * In the case of mcg delete (or create) trap when the port is sendonly 1244 * joined, the AH and MCE handling is different: the AH and MCE has to be 1245 * immediately taken off the active lists (forcing a join and path lookup 1246 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1247 * to an mcg as it is repeatedly created and deleted and goes thru 1248 * reincarnations). 1249 * 1250 * When a port is already sendonly joined, and a multicast enable is 1251 * attempted, the same mce structure is promoted; this ensures only a 1252 * single mce on the active list tracks the most powerful join state. 1253 * 1254 * In the case of port up event handling, the MCE for sendonly membership 1255 * is freed up, and the ACE is put into the free list as soon as possible 1256 * (depending on whether posted Tx's have completed). 
For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it can not possibly receive an mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 * If the trap is D2, then a LEAVE is not required, since the mcg
 * is already deleted; but if it is D1, a LEAVE is required. A safe
 * approach is to always LEAVE, but the SM may be confused if it
 * receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best. Thus, the driver just prints a warning
 * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */

/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that the max reference
 * count on any handle is limited by the number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
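 *
 * As an illustrative sketch (not part of the driver), the ac_ref
 * encoding used by the macros below behaves roughly like:
 *
 *	ac_ref = (recycle_marked ? CYCLEVAL : 0) + pending_tx_count;
 *
 * so a transmit does atomic_add_32(&ac_ref, 1), the completion handler
 * does atomic_add_32_nv(&ac_ref, -1), and a result equal to CYCLEVAL
 * means the last pending Tx just completed on an AH that was marked
 * for recycling, at which point it can be moved to the free list.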
1311 */ 1312 #define CYCLEVAL 0x80000 1313 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1314 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1315 #define GET_REF(ace) ((ace)->ac_ref) 1316 #define GET_REF_CYCLE(ace) ( \ 1317 /* \ 1318 * Make sure "cycle" bit is set. \ 1319 */ \ 1320 ASSERT(CYCLE_SET(ace)), \ 1321 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1322 ) 1323 #define INC_REF(ace, num) { \ 1324 atomic_add_32(&(ace)->ac_ref, num); \ 1325 } 1326 #define SET_CYCLE_IF_REF(ace) ( \ 1327 CYCLE_SET(ace) ? B_TRUE : \ 1328 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1329 CYCLEVAL ? \ 1330 /* \ 1331 * Clear the "cycle" bit we just set; \ 1332 * ref count known to be 0 from above. \ 1333 */ \ 1334 CLEAR_REFCYCLE(ace), B_FALSE : \ 1335 /* \ 1336 * We set "cycle" bit; let caller know. \ 1337 */ \ 1338 B_TRUE \ 1339 ) 1340 #define DEC_REF_DO_CYCLE(ace) ( \ 1341 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1342 CYCLEVAL ? \ 1343 /* \ 1344 * Ref count known to be 0 from above. \ 1345 */ \ 1346 B_TRUE : \ 1347 B_FALSE \ 1348 ) 1349 1350 static void * 1351 list_get_head(list_t *list) 1352 { 1353 list_node_t *lhead = list_head(list); 1354 1355 if (lhead != NULL) 1356 list_remove(list, lhead); 1357 return (lhead); 1358 } 1359 1360 /* 1361 * This is always guaranteed to be able to queue the work. 1362 */ 1363 static void 1364 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1365 { 1366 /* Initialize request */ 1367 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1368 ptr->rq_op = op; 1369 1370 /* 1371 * Queue provided slot onto request pool. 1372 */ 1373 mutex_enter(&state->id_acache_req_lock); 1374 list_insert_tail(&state->id_req_list, ptr); 1375 1376 /* Go, fetch, async thread */ 1377 cv_signal(&state->id_acache_req_cv); 1378 mutex_exit(&state->id_acache_req_lock); 1379 } 1380 1381 /* 1382 * Main body of the per interface async thread. 1383 */ 1384 static void 1385 ibd_async_work(ibd_state_t *state) 1386 { 1387 ibd_req_t *ptr; 1388 callb_cpr_t cprinfo; 1389 1390 mutex_enter(&state->id_acache_req_lock); 1391 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1392 callb_generic_cpr, "ibd_async_work"); 1393 for (;;) { 1394 ptr = list_get_head(&state->id_req_list); 1395 if (ptr != NULL) { 1396 mutex_exit(&state->id_acache_req_lock); 1397 1398 /* 1399 * Once we have done the operation, there is no 1400 * guarantee the request slot is going to be valid, 1401 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP). 1402 */ 1403 1404 /* Perform the request */ 1405 switch (ptr->rq_op) { 1406 case ASYNC_GETAH: 1407 ibd_async_acache(state, &ptr->rq_mac); 1408 break; 1409 case ASYNC_POKE: 1410 /* 1411 * We need the gld_sched; that 1412 * happens below. No locks are 1413 * needed for the multi_op update. 
			 */
				state->id_multi_op = NOTSTARTED;
				break;
			case ASYNC_REAP:
				ibd_async_reap_group(state,
				    ptr->rq_ptr, ptr->rq_gid,
				    IB_MC_JSTATE_FULL);
				break;
			case ASYNC_LEAVE:
			case ASYNC_JOIN:
				ibd_async_multicast(state,
				    ptr->rq_gid, ptr->rq_op);
				break;
			case ASYNC_PROMON:
				ibd_async_setprom(state, B_TRUE);
				break;
			case ASYNC_PROMOFF:
				ibd_async_unsetprom(state, B_TRUE);
				break;
			case ASYNC_TRAP:
				ibd_async_trap(state, ptr);
				break;
			case ASYNC_SCHED:
				ibd_async_txsched(state);
				break;
			case ASYNC_LINK:
				ibd_async_link(state, ptr);
				break;
			case ASYNC_EXIT:
				mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
				CALLB_CPR_EXIT(&cprinfo);
#endif /* !__lock_lint */
				_NOTE(NOT_REACHED)
				return;
			}

			/*
			 * Indicate blocked operation can now be retried.
			 * Note gld_sched() gets the gld_maclock,
			 * and the multicast/promiscuous paths
			 * (ibd_set_multicast(), ibd_set_promiscuous())
			 * grab id_acache_req_lock in ibd_queue_work_slot()
			 * with gld_maclock held, so we must not hold the
			 * id_acache_req_lock while calling gld_sched to
			 * prevent deadlock.
			 */
			gld_sched(state->id_macinfo);

			mutex_enter(&state->id_acache_req_lock);
		} else {
			/*
			 * Nothing to do: wait till new request arrives.
			 */
#ifndef __lock_lint
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif /* !_lock_lint */
		}
	}
	/*NOTREACHED*/
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (--state->id_trap_inprog == 0)
		cv_signal(&state->id_trap_cv);
	mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
	ulong_t ptraddr = (ulong_t)key;
	uint_t hval;

	/*
	 * If the input address is 4 byte aligned, we can just dereference
	 * it. This is most common, since IP will send in a 4 byte aligned
	 * IP header, which implies the 24 byte IPoIB pseudo header will be
	 * 4 byte aligned too.
1531 */ 1532 if ((ptraddr & 3) == 0) 1533 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1534 1535 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1536 return (hval); 1537 } 1538 1539 static int 1540 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1541 { 1542 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1543 return (0); 1544 else 1545 return (1); 1546 } 1547 1548 /* 1549 * Initialize all the per interface caches and lists; AH cache, 1550 * MCG list etc. 1551 */ 1552 static int 1553 ibd_acache_init(ibd_state_t *state) 1554 { 1555 ibd_ace_t *ce; 1556 int i; 1557 1558 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1559 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1560 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1561 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1562 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1563 offsetof(ibd_ace_t, ac_list)); 1564 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1565 offsetof(ibd_ace_t, ac_list)); 1566 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1567 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1568 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1569 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1570 offsetof(ibd_mce_t, mc_list)); 1571 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1572 offsetof(ibd_mce_t, mc_list)); 1573 list_create(&state->id_req_list, sizeof (ibd_req_t), 1574 offsetof(ibd_req_t, rq_list)); 1575 1576 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1577 IBD_NUM_AH, KM_SLEEP); 1578 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1579 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1580 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1581 ibd_acache_fini(state); 1582 return (DDI_FAILURE); 1583 } else { 1584 CLEAR_REFCYCLE(ce); 1585 ce->ac_mce = NULL; 1586 IBD_ACACHE_INSERT_FREE(state, ce); 1587 } 1588 } 1589 return (DDI_SUCCESS); 1590 } 1591 1592 static void 1593 ibd_acache_fini(ibd_state_t *state) 1594 { 1595 ibd_ace_t *ptr; 1596 1597 mutex_enter(&state->id_ac_mutex); 1598 1599 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1600 ASSERT(GET_REF(ptr) == 0); 1601 (void) ibt_free_ud_dest(ptr->ac_dest); 1602 } 1603 1604 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1605 ASSERT(GET_REF(ptr) == 0); 1606 (void) ibt_free_ud_dest(ptr->ac_dest); 1607 } 1608 1609 list_destroy(&state->id_ah_free); 1610 list_destroy(&state->id_ah_active); 1611 list_destroy(&state->id_mc_full); 1612 list_destroy(&state->id_mc_non); 1613 list_destroy(&state->id_req_list); 1614 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1615 mutex_exit(&state->id_ac_mutex); 1616 mutex_destroy(&state->id_ac_mutex); 1617 mutex_destroy(&state->id_mc_mutex); 1618 mutex_destroy(&state->id_acache_req_lock); 1619 cv_destroy(&state->id_acache_req_cv); 1620 } 1621 1622 /* 1623 * Search AH active hash list for a cached path to input destination. 1624 * If we are "just looking", hold == F. When we are in the Tx path, 1625 * we set hold == T to grab a reference on the AH so that it can not 1626 * be recycled to a new destination while the Tx request is posted. 1627 */ 1628 static ibd_ace_t * 1629 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1630 { 1631 ibd_ace_t *ptr; 1632 1633 ASSERT(mutex_owned(&state->id_ac_mutex)); 1634 1635 /* 1636 * Do hash search. 
1637 */ 1638 if (mod_hash_find(state->id_ah_active_hash, 1639 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1640 if (hold) 1641 INC_REF(ptr, num); 1642 return (ptr); 1643 } 1644 return (NULL); 1645 } 1646 1647 /* 1648 * This is called by the tx side; if an initialized AH is found in 1649 * the active list, it is locked down and can be used; if no entry 1650 * is found, an async request is queued to do path resolution. 1651 */ 1652 static ibd_ace_t * 1653 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1654 { 1655 ibd_ace_t *ptr; 1656 1657 /* 1658 * Only attempt to print when we can; in the mdt pattr case, the 1659 * address is not aligned properly. 1660 */ 1661 if (((ulong_t)mac & 3) == 0) 1662 DPRINT(4, 1663 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1664 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1665 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1666 htonl(mac->ipoib_gidsuff[1])); 1667 1668 mutex_enter(&state->id_ac_mutex); 1669 1670 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1671 mutex_exit(&state->id_ac_mutex); 1672 return (ptr); 1673 } 1674 1675 /* 1676 * Implementation of a single outstanding async request; if 1677 * the operation is not started yet, queue a request and move 1678 * to ongoing state. Remember in id_ah_addr for which address 1679 * we are queueing the request, in case we need to flag an error; 1680 * Any further requests, for the same or different address, until 1681 * the operation completes, is sent back to GLD to be retried. 1682 * The async thread will update id_ah_op with an error indication 1683 * or will set it to indicate the next look up can start; either 1684 * way, it will gld_sched() so that all blocked requests come 1685 * back here. 1686 */ 1687 *err = GLD_NORESOURCES; 1688 if (state->id_ah_op == NOTSTARTED) { 1689 /* 1690 * We did not even find the entry; queue a request for it. 1691 */ 1692 bcopy(mac, &(state->id_ah_req.rq_mac), IPOIB_ADDRL); 1693 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_GETAH); 1694 state->id_ah_op = ONGOING; 1695 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1696 } else if ((state->id_ah_op != ONGOING) && 1697 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1698 /* 1699 * Check the status of the pathrecord lookup request 1700 * we had queued before. 1701 */ 1702 if (state->id_ah_op == ERRORED) { 1703 *err = GLD_FAILURE; 1704 state->id_ah_error++; 1705 } else { 1706 /* 1707 * ROUTERED case: We need to send to the 1708 * all-router MCG. If we can find the AH for 1709 * the mcg, the Tx will be attempted. If we 1710 * do not find the AH, we return NORESOURCES 1711 * to retry. 1712 */ 1713 ipoib_mac_t routermac; 1714 1715 (void) ibd_get_allroutergroup(state, mac, &routermac); 1716 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1717 numwqe); 1718 } 1719 state->id_ah_op = NOTSTARTED; 1720 } 1721 mutex_exit(&state->id_ac_mutex); 1722 1723 /* 1724 * The PathRecord lookup failed; retry any other blocked 1725 * Tx requests that might have come in between when we 1726 * initiated the path lookup and now that were sent back 1727 * to GLD to implement single outstanding lookup scheme. 1728 */ 1729 if (*err == GLD_FAILURE) 1730 gld_sched(state->id_macinfo); 1731 return (ptr); 1732 } 1733 1734 /* 1735 * Grab a not-currently-in-use AH/PathRecord from the active 1736 * list to recycle to a new destination. Only the async thread 1737 * executes this code. 
1738 */ 1739 static ibd_ace_t * 1740 ibd_acache_get_unref(ibd_state_t *state) 1741 { 1742 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1743 1744 ASSERT(mutex_owned(&state->id_ac_mutex)); 1745 1746 /* 1747 * Do plain linear search. 1748 */ 1749 while (ptr != NULL) { 1750 /* 1751 * Note that it is possible that the "cycle" bit 1752 * is set on the AH w/o any reference count. The 1753 * mcg must have been deleted, and the tx cleanup 1754 * just decremented the reference count to 0, but 1755 * hasn't gotten around to grabbing the id_ac_mutex 1756 * to move the AH into the free list. 1757 */ 1758 if (GET_REF(ptr) == 0) { 1759 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1760 break; 1761 } 1762 ptr = list_next(&state->id_ah_active, ptr); 1763 } 1764 return (ptr); 1765 } 1766 1767 /* 1768 * Invoked to clean up AH from active list in case of multicast 1769 * disable and to handle sendonly memberships during mcg traps. 1770 * And for port up processing for multicast and unicast AHs. 1771 * Normally, the AH is taken off the active list, and put into 1772 * the free list to be recycled for a new destination. In case 1773 * Tx requests on the AH have not completed yet, the AH is marked 1774 * for reaping (which will put the AH on the free list) once the Tx's 1775 * complete; in this case, depending on the "force" input, we take 1776 * out the AH from the active list right now, or leave it also for 1777 * the reap operation. Returns TRUE if the AH is taken off the active 1778 * list (and either put into the free list right now, or arranged for 1779 * later), FALSE otherwise. 1780 */ 1781 static boolean_t 1782 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1783 { 1784 ibd_ace_t *acactive; 1785 boolean_t ret = B_TRUE; 1786 1787 ASSERT(mutex_owned(&state->id_ac_mutex)); 1788 1789 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1790 1791 /* 1792 * Note that the AH might already have the cycle bit set 1793 * on it; this might happen if sequences of multicast 1794 * enables and disables are coming so fast, that posted 1795 * Tx's to the mcg have not completed yet, and the cycle 1796 * bit is set successively by each multicast disable. 1797 */ 1798 if (SET_CYCLE_IF_REF(acactive)) { 1799 if (!force) { 1800 /* 1801 * The ace is kept on the active list, further 1802 * Tx's can still grab a reference on it; the 1803 * ace is reaped when all pending Tx's 1804 * referencing the AH complete. 1805 */ 1806 ret = B_FALSE; 1807 } else { 1808 /* 1809 * In the mcg trap case, we always pull the 1810 * AH from the active list. And also the port 1811 * up multi/unicast case. 1812 */ 1813 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1814 acactive->ac_mce = NULL; 1815 } 1816 } else { 1817 /* 1818 * Determined the ref count is 0, thus reclaim 1819 * immediately after pulling out the ace from 1820 * the active list. 1821 */ 1822 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1823 acactive->ac_mce = NULL; 1824 IBD_ACACHE_INSERT_FREE(state, acactive); 1825 } 1826 1827 } 1828 return (ret); 1829 } 1830 1831 /* 1832 * Helper function for async path record lookup. If we are trying to 1833 * Tx to a MCG, check our membership, possibly trying to join the 1834 * group if required. If that fails, try to send the packet to the 1835 * all router group (indicated by the redirect output), pointing 1836 * the input mac address to the router mcg address. 
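 *
 * Illustration only, the fallback order implemented below:
 *
 *	1. mgid already on the id_mc_full list        -> use that mce
 *	2. join(SendOnlyNonMember) to the mgid        -> use the new mce
 *	3. otherwise map mac to the all-router group (*redirect = B_TRUE):
 *	   3a. router mgid already on id_mc_full      -> use it
 *	   3b. join(SendOnlyNonMember) to the router  -> use it
 *	4. return NULL and let the caller flag ERRORED.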
1837 */ 1838 static ibd_mce_t * 1839 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1840 { 1841 ib_gid_t mgid; 1842 ibd_mce_t *mce; 1843 ipoib_mac_t routermac; 1844 1845 *redirect = B_FALSE; 1846 ibd_n2h_gid(mac, &mgid); 1847 1848 /* 1849 * Check the FullMember+SendOnlyNonMember list. 1850 * Since we are the only one who manipulates the 1851 * id_mc_full list, no locks are needed. 1852 */ 1853 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1854 if (mce != NULL) { 1855 DPRINT(4, "ibd_async_mcache : already joined to group"); 1856 return (mce); 1857 } 1858 1859 /* 1860 * Not found; try to join(SendOnlyNonMember) and attach. 1861 */ 1862 DPRINT(4, "ibd_async_mcache : not joined to group"); 1863 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1864 NULL) { 1865 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1866 return (mce); 1867 } 1868 1869 /* 1870 * MCGroup not present; try to join the all-router group. If 1871 * any of the following steps succeed, we will be redirecting 1872 * to the all router group. 1873 */ 1874 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1875 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1876 return (NULL); 1877 *redirect = B_TRUE; 1878 ibd_n2h_gid(&routermac, &mgid); 1879 bcopy(&routermac, mac, IPOIB_ADDRL); 1880 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1881 mgid.gid_prefix, mgid.gid_guid); 1882 1883 /* 1884 * Are we already joined to the router group? 1885 */ 1886 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1887 DPRINT(4, "ibd_async_mcache : using already joined router" 1888 "group\n"); 1889 return (mce); 1890 } 1891 1892 /* 1893 * Can we join(SendOnlyNonMember) the router group? 1894 */ 1895 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1896 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1897 NULL) { 1898 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1899 return (mce); 1900 } 1901 1902 return (NULL); 1903 } 1904 1905 /* 1906 * Async path record lookup code. 1907 */ 1908 static void 1909 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1910 { 1911 ibd_ace_t *ce; 1912 ibd_mce_t *mce = NULL; 1913 ibt_path_attr_t path_attr; 1914 ibt_path_info_t path_info; 1915 ib_gid_t destgid; 1916 int ret = NOTSTARTED; 1917 1918 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1919 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1920 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1921 htonl(mac->ipoib_gidsuff[1])); 1922 1923 /* 1924 * Check whether we are trying to transmit to a MCG. 1925 * In that case, we need to make sure we are a member of 1926 * the MCG. 1927 */ 1928 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1929 boolean_t redirected; 1930 1931 /* 1932 * If we can not find or join the group or even 1933 * redirect, error out. 1934 */ 1935 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1936 NULL) { 1937 state->id_ah_op = ERRORED; 1938 return; 1939 } 1940 1941 /* 1942 * If we got redirected, we need to determine whether 1943 * the AH for the new mcg is in the cache already, and 1944 * not pull it in then; otherwise proceed to get the 1945 * path for the new mcg. There is no guarantee that 1946 * if the AH is currently in the cache, it will still be 1947 * there when we look in ibd_acache_lookup(), but that's 1948 * okay, we will come back here. 
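 *
 * Rough sketch of the retry loop this relies on (illustration only):
 *
 *	GLD send --> ibd_acache_lookup() misses --> GLD_NORESOURCES
 *	async thread resolves the path, then gld_sched()
 *	GLD resend --> ibd_acache_lookup() hits, or we end up back here.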
1949 */ 1950 if (redirected) { 1951 ret = ROUTERED; 1952 DPRINT(4, "ibd_async_acache : redirected to " 1953 "%08X:%08X:%08X:%08X:%08X", 1954 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1955 htonl(mac->ipoib_gidpref[1]), 1956 htonl(mac->ipoib_gidsuff[0]), 1957 htonl(mac->ipoib_gidsuff[1])); 1958 1959 mutex_enter(&state->id_ac_mutex); 1960 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1961 mutex_exit(&state->id_ac_mutex); 1962 DPRINT(4, "ibd_async_acache : router AH found"); 1963 state->id_ah_op = ROUTERED; 1964 return; 1965 } 1966 mutex_exit(&state->id_ac_mutex); 1967 } 1968 } 1969 1970 /* 1971 * Get an AH from the free list. 1972 */ 1973 mutex_enter(&state->id_ac_mutex); 1974 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1975 /* 1976 * No free ones; try to grab an unreferenced active 1977 * one. Maybe we need to make the active list LRU, 1978 * but that will create more work for Tx callbacks. 1979 * Is there a way of not having to pull out the 1980 * entry from the active list, but just indicate it 1981 * is being recycled? Yes, but that creates one more 1982 * check in the fast lookup path. 1983 */ 1984 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1985 /* 1986 * Pretty serious shortage now. 1987 */ 1988 state->id_ah_op = NOTSTARTED; 1989 mutex_exit(&state->id_ac_mutex); 1990 DPRINT(10, "ibd_async_acache : failed to find AH " 1991 "slot\n"); 1992 return; 1993 } 1994 /* 1995 * We could check whether ac_mce points to a SendOnly 1996 * member and drop that membership now. Or do it lazily 1997 * at detach time. 1998 */ 1999 ce->ac_mce = NULL; 2000 } 2001 mutex_exit(&state->id_ac_mutex); 2002 ASSERT(ce->ac_mce == NULL); 2003 2004 /* 2005 * Update the entry. 2006 */ 2007 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 2008 2009 bzero(&path_info, sizeof (path_info)); 2010 bzero(&path_attr, sizeof (ibt_path_attr_t)); 2011 path_attr.pa_sgid = state->id_sgid; 2012 path_attr.pa_num_dgids = 1; 2013 ibd_n2h_gid(&ce->ac_mac, &destgid); 2014 path_attr.pa_dgids = &destgid; 2015 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2016 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2017 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 2018 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 2019 goto error; 2020 } 2021 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 2022 ntohl(ce->ac_mac.ipoib_qpn), 2023 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 2024 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 2025 goto error; 2026 } 2027 2028 /* 2029 * mce is set whenever an AH is being associated with a 2030 * MCG; this will come in handy when we leave the MCG. The 2031 * lock protects Tx fastpath from scanning the active list. 2032 */ 2033 if (mce != NULL) 2034 ce->ac_mce = mce; 2035 mutex_enter(&state->id_ac_mutex); 2036 IBD_ACACHE_INSERT_ACTIVE(state, ce); 2037 state->id_ah_op = ret; 2038 mutex_exit(&state->id_ac_mutex); 2039 return; 2040 error: 2041 /* 2042 * We might want to drop SendOnly membership here if we 2043 * joined above. The lock protects Tx callbacks inserting 2044 * into the free list. 2045 */ 2046 mutex_enter(&state->id_ac_mutex); 2047 state->id_ah_op = ERRORED; 2048 IBD_ACACHE_INSERT_FREE(state, ce); 2049 mutex_exit(&state->id_ac_mutex); 2050 } 2051 2052 /* 2053 * While restoring port's presence on the subnet on a port up, it is possible 2054 * that the port goes down again. 
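 *
 * Illustration only, how the queued opcode is handled below:
 *
 *	IBD_LINK_DOWN       gld_linkstate(GLD_LINKSTATE_DOWN)
 *	IBD_LINK_UP         refresh id_link_speed, gld_linkstate(UP)
 *	IBD_LINK_UP_ABSENT  refresh id_link_speed, rebuild mcg and AH
 *	                    state, then gld_linkstate(UP)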
2055 */ 2056 static void 2057 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 2058 { 2059 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 2060 int32_t lstate = (opcode == IBD_LINK_DOWN) ? GLD_LINKSTATE_DOWN : 2061 GLD_LINKSTATE_UP; 2062 ibd_mce_t *mce, *pmce; 2063 ibd_ace_t *ace, *pace; 2064 2065 DPRINT(10, "ibd_async_link(): %d", opcode); 2066 2067 /* 2068 * On a link up, revalidate the link speed/width. No point doing 2069 * this on a link down, since we will be unable to do SA operations, 2070 * defaulting to the lowest speed. Also notice that we update our 2071 * notion of speed before calling gld_linkstate(), which will do 2072 * necessary higher level notifications for speed changes. 2073 */ 2074 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 2075 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2076 state->id_link_speed = ibd_get_portspeed(state); 2077 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2078 } 2079 2080 /* 2081 * Do all the work required to establish our presence on 2082 * the subnet. 2083 */ 2084 if (opcode == IBD_LINK_UP_ABSENT) { 2085 /* 2086 * If in promiscuous mode ... 2087 */ 2088 if (state->id_prom_op == COMPLETED) { 2089 /* 2090 * Drop all nonmembership. 2091 */ 2092 ibd_async_unsetprom(state, B_FALSE); 2093 2094 /* 2095 * Then, try to regain nonmembership to all mcg's. 2096 */ 2097 ibd_async_setprom(state, B_FALSE); 2098 2099 } 2100 2101 /* 2102 * Drop all sendonly membership (which also gets rid of the 2103 * AHs); try to reacquire all full membership. 2104 */ 2105 mce = list_head(&state->id_mc_full); 2106 while ((pmce = mce) != NULL) { 2107 mce = list_next(&state->id_mc_full, mce); 2108 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 2109 ibd_leave_group(state, 2110 pmce->mc_info.mc_adds_vect.av_dgid, 2111 IB_MC_JSTATE_SEND_ONLY_NON); 2112 else 2113 ibd_reacquire_group(state, pmce); 2114 } 2115 2116 /* 2117 * Recycle all active AHs to free list (and if there are 2118 * pending posts, make sure they will go into the free list 2119 * once the Tx's complete). Grab the lock to prevent 2120 * concurrent Tx's as well as Tx cleanups. 2121 */ 2122 mutex_enter(&state->id_ac_mutex); 2123 ace = list_head(&state->id_ah_active); 2124 while ((pace = ace) != NULL) { 2125 boolean_t cycled; 2126 2127 ace = list_next(&state->id_ah_active, ace); 2128 mce = pace->ac_mce; 2129 cycled = ibd_acache_recycle(state, &pace->ac_mac, 2130 B_TRUE); 2131 /* 2132 * If this is for an mcg, it must be for a fullmember, 2133 * since we got rid of send-only members above when 2134 * processing the mce list. 2135 */ 2136 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2137 IB_MC_JSTATE_FULL))); 2138 2139 /* 2140 * Check if the fullmember mce needs to be torn down, 2141 * ie whether the DLPI disable has already been done. 2142 * If so, do some of the work of tx_cleanup, namely 2143 * causing leave (which will fail), detach and 2144 * mce-freeing. tx_cleanup will put the AH into free 2145 * list. The reason to duplicate some of this 2146 * tx_cleanup work is because we want to delete the 2147 * AH right now instead of waiting for tx_cleanup, to 2148 * force subsequent Tx's to reacquire an AH. 2149 */ 2150 if ((mce != NULL) && (mce->mc_fullreap)) 2151 ibd_async_reap_group(state, mce, 2152 mce->mc_info.mc_adds_vect.av_dgid, 2153 mce->mc_jstate); 2154 } 2155 mutex_exit(&state->id_ac_mutex); 2156 } 2157 2158 /* 2159 * Macinfo is guaranteed to exist since driver does ibt_close_hca() 2160 * (which stops further events from being delivered) before 2161 * gld_mac_free().
At this point, it is guaranteed that gld_register 2162 * has already been done. 2163 */ 2164 mutex_enter(&state->id_link_mutex); 2165 state->id_link_state = lstate; 2166 gld_linkstate(state->id_macinfo, lstate); 2167 mutex_exit(&state->id_link_mutex); 2168 2169 /* 2170 * Free the request slot allocated by the event thread. 2171 */ 2172 kmem_free(req, sizeof (ibd_req_t)); 2173 2174 ibd_async_done(state); 2175 } 2176 2177 /* 2178 * When the link is notified up, we need to do a few things, based 2179 * on the port's current p_init_type_reply claiming a reinit has been 2180 * done or not. The reinit steps are: 2181 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2182 * the old Pkey and GID0 are correct. 2183 * 2. Register for mcg traps (already done by ibmf). 2184 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2185 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2186 * 4. Give up all sendonly memberships. 2187 * 5. Acquire all full memberships. 2188 * 6. In promiscuous mode, acquire all non memberships. 2189 * 7. Recycle all AHs to free list. 2190 */ 2191 static void 2192 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2193 { 2194 ibt_hca_portinfo_t *port_infop; 2195 ibt_status_t ibt_status; 2196 uint_t psize, port_infosz; 2197 ibd_link_op_t opcode; 2198 ibd_req_t *req; 2199 2200 /* 2201 * Do not send a request to the async daemon if it has not 2202 * yet been created or is being destroyed. If the async 2203 * daemon has not yet been created, we still need to track 2204 * last known state of the link. If this code races with the 2205 * detach path, then we are assured that the detach path has 2206 * not yet done the ibt_close_hca (which waits for all async 2207 * events to complete). If the code races with the attach path, 2208 * we need to validate the pkey/gid (in the link_up case) if 2209 * the initialization path has already set these up and created 2210 * IBTF resources based on the values. 2211 */ 2212 mutex_enter(&state->id_link_mutex); 2213 2214 /* 2215 * If the init code in ibd_drv_init hasn't yet set up the 2216 * pkey/gid, nothing to do; that code will set the link state. 2217 */ 2218 if (state->id_link_state == GLD_LINKSTATE_UNKNOWN) { 2219 mutex_exit(&state->id_link_mutex); 2220 return; 2221 } 2222 2223 if (code == IBT_EVENT_PORT_UP) { 2224 uint8_t itreply; 2225 boolean_t badup = B_FALSE; 2226 2227 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 2228 state->id_port, &port_infop, &psize, &port_infosz); 2229 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 2230 mutex_exit(&state->id_link_mutex); 2231 DPRINT(10, "ibd_link_up : failed in" 2232 " ibt_query_port()\n"); 2233 return; 2234 } 2235 2236 /* 2237 * If the link already went down by the time the handler gets 2238 * here, give up; we can not even validate pkey/gid since those 2239 * are not valid. 2240 */ 2241 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) 2242 badup = B_TRUE; 2243 2244 itreply = port_infop->p_init_type_reply; 2245 2246 /* 2247 * In InitTypeReply, check if NoLoadReply == 2248 * PreserveContentReply == 0, in which case, verify Pkey/GID0. 2249 */ 2250 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2251 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && 2252 (!badup)) { 2253 /* 2254 * Check that the subnet part of GID0 has not changed. 
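 *
 * To summarize how the InitTypeReply bits steer this handler
 * (illustration only, matching the code below):
 *
 *	NoLoadReply == 0 && PreserveContentReply == 0
 *	    -> the SM may have reprogrammed the port: verify GID0/Pkey
 *	PreservePresenceReply == 1
 *	    -> mcg presence intact: opcode IBD_LINK_UP
 *	otherwise
 *	    -> opcode IBD_LINK_UP_ABSENT, async daemon rebuilds presence
 *	any verification failure -> treated as IBT_ERROR_PORT_DOWN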
2255 */ 2256 if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, 2257 sizeof (ib_gid_t)) != 0) 2258 badup = B_TRUE; 2259 2260 /* 2261 * Check that Pkey/index mapping is still valid. 2262 */ 2263 if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || 2264 (port_infop->p_pkey_tbl[state->id_pkix] != 2265 state->id_pkey)) 2266 badup = B_TRUE; 2267 } 2268 2269 /* 2270 * In InitTypeReply, if PreservePresenceReply indicates the SM 2271 * has ensured that the port's presence in mcg, traps etc is 2272 * intact, nothing more to do. 2273 */ 2274 opcode = IBD_LINK_UP_ABSENT; 2275 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2276 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) 2277 opcode = IBD_LINK_UP; 2278 2279 if (badup) 2280 code = IBT_ERROR_PORT_DOWN; 2281 ibt_free_portinfo(port_infop, port_infosz); 2282 } 2283 2284 if (!ibd_async_safe(state)) { 2285 state->id_link_state = ((code == IBT_EVENT_PORT_UP) ? 2286 GLD_LINKSTATE_UP : GLD_LINKSTATE_DOWN); 2287 mutex_exit(&state->id_link_mutex); 2288 return; 2289 } 2290 mutex_exit(&state->id_link_mutex); 2291 2292 if (code == IBT_ERROR_PORT_DOWN) 2293 opcode = IBD_LINK_DOWN; 2294 2295 req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); 2296 req->rq_ptr = (void *)opcode; 2297 ibd_queue_work_slot(state, req, ASYNC_LINK); 2298 } 2299 2300 /* 2301 * For the port up/down events, IBTL guarantees there will not be concurrent 2302 * invocations of the handler. IBTL might coalesce link transition events, 2303 * and not invoke the handler for _each_ up/down transition, but it will 2304 * invoke the handler with last known state 2305 */ 2306 static void 2307 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2308 ibt_async_code_t code, ibt_async_event_t *event) 2309 { 2310 ibd_state_t *state = (ibd_state_t *)clnt_private; 2311 2312 switch (code) { 2313 case IBT_ERROR_CATASTROPHIC_CHAN: 2314 ibd_print_warn(state, "catastrophic channel error"); 2315 break; 2316 case IBT_ERROR_CQ: 2317 ibd_print_warn(state, "completion queue error"); 2318 break; 2319 case IBT_ERROR_PORT_DOWN: 2320 case IBT_EVENT_PORT_UP: 2321 /* 2322 * Events will be delivered to all instances that have 2323 * done ibt_open_hca() but not yet done ibt_close_hca(). 2324 * Only need to do work for our port; IBTF will deliver 2325 * events for other ports on the hca we have ibt_open_hca'ed 2326 * too. Note that ibd_drv_init() initializes id_port before 2327 * doing ibt_open_hca(). 2328 */ 2329 ASSERT(state->id_hca_hdl == hca_hdl); 2330 if (state->id_port != event->ev_port) 2331 break; 2332 2333 ibd_link_mod(state, code); 2334 break; 2335 2336 case IBT_HCA_ATTACH_EVENT: 2337 case IBT_HCA_DETACH_EVENT: 2338 /* 2339 * When a new card is plugged to the system, attach_event is 2340 * invoked. Additionally, a cfgadm needs to be run to make the 2341 * card known to the system, and an ifconfig needs to be run to 2342 * plumb up any ibd interfaces on the card. In the case of card 2343 * unplug, a cfgadm is run that will trigger any RCM scripts to 2344 * unplumb the ibd interfaces on the card; when the card is 2345 * actually unplugged, the detach_event is invoked; 2346 * additionally, if any ibd instances are still active on the 2347 * card (eg there were no associated RCM scripts), driver's 2348 * detach routine is invoked. 2349 */ 2350 break; 2351 default: 2352 break; 2353 } 2354 } 2355 2356 /* 2357 * Attach device to the IO framework. 
2358 */ 2359 static int 2360 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2361 { 2362 ibd_state_t *state; 2363 int instance; 2364 2365 switch (cmd) { 2366 case DDI_ATTACH: 2367 break; 2368 case DDI_RESUME: 2369 /* This driver does not support resume */ 2370 default: 2371 return (DDI_FAILURE); 2372 } 2373 2374 /* 2375 * Allocate soft device data structure 2376 */ 2377 instance = ddi_get_instance(dip); 2378 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2379 return (DDI_FAILURE); 2380 state = ddi_get_soft_state(ibd_list, instance); 2381 2382 /* pre ibt_attach() soft state initialization */ 2383 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2384 DPRINT(10, "ibd_attach : failed in ibd_state_init()"); 2385 goto attach_fail_state_init; 2386 } 2387 2388 /* "attach" to IBTL */ 2389 if (ibt_attach(&ibd_clnt_modinfo, dip, state, 2390 &state->id_ibt_hdl) != IBT_SUCCESS) { 2391 DPRINT(10, "ibd_attach : failed in ibt_attach()"); 2392 goto attach_fail_ibt_attach; 2393 } 2394 2395 /* Finish initializing this driver */ 2396 if (ibd_drv_init(state) != DDI_SUCCESS) { 2397 DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); 2398 goto attach_fail_drv_init; 2399 } 2400 2401 /* 2402 * Register ourselves with the GLD interface 2403 * 2404 * gld_register will: 2405 * link us with the GLD module; 2406 * set our ddi_set_driver_private(9F) data to the macinfo ptr; 2407 * save the devinfo pointer in macinfo->gldm_devinfo; 2408 * create the minor device node. 2409 */ 2410 if (gld_register(dip, "ibd", state->id_macinfo) != DDI_SUCCESS) { 2411 DPRINT(10, "ibd_attach : failed in gld_register()"); 2412 goto attach_fail_gld_register; 2413 } 2414 2415 /* 2416 * Set up the handler we will use for regular DLPI stuff. It's 2417 * important to set up the recv handler after registering with gld. 2418 * Setting it up earlier sometimes causes an incoming packet to be 2419 * forwarded to gld before the gld_register. This will result in gld 2420 * dropping the packet, which ibd_rcq_handler ignores, thus failing 2421 * to re-arm the tavor events. This will cause tavor_isr on the recv 2422 * path not to be invoked any further. 2423 */ 2424 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2425 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 2426 IBT_SUCCESS) { 2427 DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); 2428 goto attach_fail_gld_register; 2429 } 2430 2431 /* 2432 * Set up the subnet notices handler after we initialize the a/mcaches 2433 * and start the async thread, both of which are required for the 2434 * trap handler to function properly. Enable the trap handler to 2435 * queue requests to the async thread after the gld_register, because 2436 * the async daemon invokes gld_sched(), which must be done after 2437 * gld_register(). 2438 */ 2439 ibt_register_subnet_notices(state->id_ibt_hdl, 2440 ibd_snet_notices_handler, state); 2441 mutex_enter(&state->id_trap_lock); 2442 state->id_trap_stop = B_FALSE; 2443 mutex_exit(&state->id_trap_lock); 2444 2445 /* 2446 * Indicate link status to GLD and higher layers. By default, 2447 * we assume we are in up state (which must have been true at 2448 * least at the time the broadcast mcg's were probed); if there 2449 * were any up/down transitions till the time we come here, the 2450 * async handler will have updated last known state, which we 2451 * use to tell GLD. The async handler will not send any 2452 * notifications to GLD till we reach here in the initialization 2453 * sequence.
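 *
 * Illustration only, the ordering the attach path above depends on:
 *
 *	ibd_state_init -> ibt_attach -> ibd_drv_init -> gld_register
 *	    -> Rx CQ handler + ibt_enable_cq_notify
 *	    -> ibt_register_subnet_notices -> gld_linkstate
 *
 * and each attach_fail_* label below unwinds exactly the steps that
 * completed before the failure, in reverse order.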
2454 */ 2455 mutex_enter(&state->id_link_mutex); 2456 gld_linkstate(state->id_macinfo, state->id_link_state); 2457 mutex_exit(&state->id_link_mutex); 2458 2459 return (DDI_SUCCESS); 2460 2461 /* Attach failure points, cleanup */ 2462 attach_fail_gld_register: 2463 ibd_drv_fini(state); 2464 2465 attach_fail_drv_init: 2466 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2467 ibd_print_warn(state, "failed to free IB resources"); 2468 2469 attach_fail_ibt_attach: 2470 ibd_state_fini(state); 2471 2472 attach_fail_state_init: 2473 ddi_soft_state_free(ibd_list, instance); 2474 2475 return (DDI_FAILURE); 2476 } 2477 2478 /* 2479 * Detach device from the IO framework. 2480 */ 2481 static int 2482 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2483 { 2484 ibd_state_t *state; 2485 int status; 2486 int instance; 2487 2488 switch (cmd) { 2489 case DDI_DETACH: 2490 break; 2491 case DDI_SUSPEND: 2492 default: 2493 return (DDI_FAILURE); 2494 } 2495 2496 instance = ddi_get_instance(dip); 2497 state = ddi_get_soft_state(ibd_list, instance); 2498 2499 /* 2500 * First, stop receive interrupts; this stops the 2501 * driver from handing up buffers to higher layers. 2502 * Wait for receive buffers to be returned; give up 2503 * after 5 seconds. 2504 */ 2505 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 2506 status = 50; 2507 while (state->id_rx_list.dl_bufs_outstanding > 0) { 2508 delay(drv_usectohz(100000)); 2509 if (--status == 0) { 2510 DPRINT(2, "ibd_detach : reclaiming failed"); 2511 goto failed; 2512 } 2513 } 2514 2515 if (gld_unregister(state->id_macinfo) != DDI_SUCCESS) { 2516 DPRINT(10, "ibd_detach : failed in gld_unregister()"); 2517 goto failed; 2518 } 2519 2520 ibd_drv_fini(state); 2521 2522 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2523 ibd_print_warn(state, "failed to free all IB resources at " 2524 "driver detach time"); 2525 2526 ibd_state_fini(state); 2527 ddi_soft_state_free(ibd_list, instance); 2528 return (DDI_SUCCESS); 2529 2530 failed: 2531 /* 2532 * Reap all the Tx/Rx completions that were posted since we 2533 * turned off the notification. Turn on notifications. There 2534 * is a race in that we do not reap completions that come in 2535 * after the poll and before notifications get turned on. That 2536 * is okay, the next rx/tx packet will trigger a completion 2537 * that will reap any missed completions. 
2538 */ 2539 ibd_poll_compq(state, state->id_rcq_hdl); 2540 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2541 return (DDI_FAILURE); 2542 } 2543 2544 /* 2545 * Pre ibt_attach() driver initialization 2546 */ 2547 static int 2548 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2549 { 2550 gld_mac_info_t *macinfo; 2551 2552 if ((macinfo = gld_mac_alloc(dip)) == NULL) { 2553 DPRINT(10, "ibd_state_init : failed in gld_mac_alloc()"); 2554 return (DDI_FAILURE); 2555 } 2556 2557 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2558 state->id_link_state = GLD_LINKSTATE_UNKNOWN; 2559 2560 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2561 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2562 state->id_trap_stop = B_TRUE; 2563 state->id_trap_inprog = 0; 2564 2565 mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL); 2566 state->id_dip = dip; 2567 state->id_wcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); 2568 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); 2569 2570 state->id_sched_queued = B_FALSE; 2571 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2572 2573 state->id_tx_list.dl_head = NULL; 2574 state->id_tx_list.dl_tail = NULL; 2575 state->id_tx_list.dl_pending_sends = B_FALSE; 2576 state->id_tx_list.dl_cnt = 0; 2577 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2578 2579 state->id_rx_list.dl_head = NULL; 2580 state->id_rx_list.dl_tail = NULL; 2581 state->id_rx_list.dl_bufs_outstanding = 0; 2582 state->id_rx_list.dl_cnt = 0; 2583 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2584 2585 /* Link up various structs for later access */ 2586 macinfo->gldm_private = (caddr_t)state; 2587 state->id_macinfo = macinfo; 2588 2589 /* 2590 * Initialize pointers to device specific functions which will be 2591 * used by the generic layer. 2592 */ 2593 macinfo->gldm_reset = ibd_reset; 2594 macinfo->gldm_start = ibd_start; 2595 macinfo->gldm_stop = ibd_stop; 2596 macinfo->gldm_set_mac_addr = ibd_set_mac_addr; 2597 macinfo->gldm_set_multicast = ibd_set_multicast; 2598 macinfo->gldm_set_promiscuous = ibd_set_promiscuous; 2599 macinfo->gldm_get_stats = ibd_get_stats; 2600 macinfo->gldm_send = ibd_send; 2601 macinfo->gldm_intr = ibd_intr; 2602 macinfo->gldm_mdt_pre = ibd_mdt_pre; 2603 macinfo->gldm_mdt_send = ibd_mdt_txone; 2604 macinfo->gldm_mdt_post = ibd_mdt_post; 2605 macinfo->gldm_mdt_sgl = state->id_max_sqseg; 2606 macinfo->gldm_mdt_segs = IBD_MDTMAX_SEGS; 2607 2608 /* Initialize board characteristics needed by the generic layer. 
*/ 2609 macinfo->gldm_ident = "InfiniBand DLPI Driver"; 2610 macinfo->gldm_type = DL_IB; 2611 macinfo->gldm_minpkt = 0; /* assumes we pad ourselves */ 2612 macinfo->gldm_addrlen = IPOIB_ADDRL; 2613 macinfo->gldm_saplen = -2; 2614 macinfo->gldm_capabilities = GLD_CAP_LINKSTATE; 2615 2616 /* Other required initialization */ 2617 macinfo->gldm_ppa = ddi_get_instance(dip); 2618 macinfo->gldm_devinfo = dip; 2619 2620 return (DDI_SUCCESS); 2621 } 2622 2623 /* 2624 * Post ibt_detach() driver deconstruction 2625 */ 2626 static void 2627 ibd_state_fini(ibd_state_t *state) 2628 { 2629 mutex_destroy(&state->id_tx_list.dl_mutex); 2630 mutex_destroy(&state->id_rx_list.dl_mutex); 2631 mutex_destroy(&state->id_sched_lock); 2632 mutex_destroy(&state->id_txcomp_lock); 2633 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); 2634 kmem_free(state->id_wcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); 2635 cv_destroy(&state->id_trap_cv); 2636 mutex_destroy(&state->id_trap_lock); 2637 mutex_destroy(&state->id_link_mutex); 2638 gld_mac_free(state->id_macinfo); 2639 } 2640 2641 /* 2642 * Fetch IBA parameters for the network device from IB nexus. 2643 */ 2644 static int 2645 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) 2646 { 2647 /* 2648 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. 2649 * Note that the default partition is also allowed. 2650 */ 2651 state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2652 0, "port-pkey", IB_PKEY_INVALID_LIMITED); 2653 if (state->id_pkey <= IB_PKEY_INVALID_FULL) { 2654 DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" 2655 "partition\n"); 2656 return (DDI_FAILURE); 2657 } 2658 2659 /* 2660 * ... the IBA port ... 2661 */ 2662 state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2663 0, "port-number", 0); 2664 if (state->id_port == 0) { 2665 DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); 2666 return (DDI_FAILURE); 2667 } 2668 2669 /* 2670 * ... and HCA GUID. 2671 */ 2672 *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 2673 0, "hca-guid", 0); 2674 if (*hca_guid == 0) { 2675 DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " 2676 "guid\n"); 2677 return (DDI_FAILURE); 2678 } 2679 2680 return (DDI_SUCCESS); 2681 } 2682 2683 /* 2684 * Fetch link speed from SA for snmp ifspeed reporting. 2685 */ 2686 static uint64_t 2687 ibd_get_portspeed(ibd_state_t *state) 2688 { 2689 int ret; 2690 uint64_t ifspeed; 2691 size_t length; 2692 ib_lid_t lid; 2693 sa_portinfo_record_t req, *resp = NULL; 2694 ibmf_saa_access_args_t args; 2695 ibmf_saa_handle_t saa_handle; 2696 2697 /* 2698 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2699 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2700 * 2000000000. Start with that as default. 
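 *
 * Worked example (illustration only): 8b/10b coding carries 8 data
 * bits per 10 line bits, so
 *
 *	1X:  2.5e9 * 8/10 = 2e9 bits/s (the default below)
 *	4X:  2e9 * 4      = 8e9 bits/s
 *	12X: 2e9 * 12     = 24e9 bits/s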
2701 */ 2702 ifspeed = 2000000000; 2703 2704 /* Get port lid */ 2705 if (ibt_get_port_state(state->id_hca_hdl, state->id_port, NULL, 2706 &lid) != IBT_SUCCESS) 2707 goto earlydone; 2708 2709 if (ibmf_sa_session_open(state->id_sgid.gid_guid, 0, NULL, 2710 IBMF_VERSION, 0, &saa_handle) != IBMF_SUCCESS) 2711 goto earlydone; 2712 2713 /* Contact SA Access */ 2714 bzero(&req, sizeof (sa_portinfo_record_t)); 2715 req.EndportLID = lid; 2716 2717 args.sq_attr_id = SA_PORTINFORECORD_ATTRID; 2718 args.sq_access_type = IBMF_SAA_RETRIEVE; 2719 args.sq_component_mask = SA_PORTINFO_COMPMASK_PORTLID; 2720 args.sq_template = &req; 2721 args.sq_callback = NULL; 2722 args.sq_callback_arg = NULL; 2723 2724 ret = ibmf_sa_access(saa_handle, &args, 0, &length, (void **) &resp); 2725 if ((ret != IBMF_SUCCESS) || (length == 0) || (resp == NULL)) 2726 goto done; 2727 2728 /* 2729 * 4X/12X needs appropriate multipliers. With IBA 1.2 additions, 2730 * double and quad multipliers are also needed per LinkSpeedEnabled. 2731 * In case SA does not return an expected value, report the default 2732 * speed as 1X. 2733 */ 2734 ret = 1; 2735 switch (resp->PortInfo.LinkWidthActive) { 2736 case SM_LINK_WIDTH_ACTIVE_1X: 2737 ret = 1; 2738 break; 2739 case SM_LINK_WIDTH_ACTIVE_4X: 2740 ret = 4; 2741 break; 2742 case SM_LINK_WIDTH_ACTIVE_12X: 2743 ret = 12; 2744 break; 2745 } 2746 ifspeed *= ret; 2747 kmem_free(resp, length); 2748 2749 done: 2750 (void) ibmf_sa_session_close(&saa_handle, 0); 2751 2752 earlydone: 2753 return (ifspeed); 2754 } 2755 2756 /* 2757 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2758 * representing the input mcg mgid. 2759 */ 2760 static ibd_mce_t * 2761 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2762 { 2763 ibd_mce_t *ptr = list_head(mlist); 2764 2765 /* 2766 * Do plain linear search. 2767 */ 2768 while (ptr != NULL) { 2769 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2770 sizeof (ib_gid_t)) == 0) 2771 return (ptr); 2772 ptr = list_next(mlist, ptr); 2773 } 2774 return (NULL); 2775 } 2776 2777 /* 2778 * Execute IBA JOIN. 2779 */ 2780 static ibt_status_t 2781 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2782 { 2783 ibt_mcg_attr_t mcg_attr; 2784 2785 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2786 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2787 mcg_attr.mc_mgid = mgid; 2788 mcg_attr.mc_join_state = mce->mc_jstate; 2789 mcg_attr.mc_scope = state->id_scope; 2790 mcg_attr.mc_pkey = state->id_pkey; 2791 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2792 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2793 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2794 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2795 NULL, NULL)); 2796 } 2797 2798 /* 2799 * This code JOINs the port in the proper way (depending on the join 2800 * state) so that IBA fabric will forward mcg packets to/from the port. 2801 * It also attaches the QPN to the mcg so it can receive those mcg 2802 * packets. This code makes sure not to attach the mcg to the QP if 2803 * that has been previously done due to the mcg being joined with a 2804 * different join state, even though this is not required by SWG_0216, 2805 * refid 3610. 
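 *
 * Illustration only, the attach decision coded below:
 *
 *	joining NON        attach, unless already FULL joined/attached
 *	joining FULL       attach, unless already NON joined/attached
 *	joining SEND_ONLY  never attach; a send-only join only needs the
 *	                   fabric to forward our Tx, not QP reception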
2806 */ 2807 static ibd_mce_t * 2808 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2809 { 2810 ibt_status_t ibt_status; 2811 ibd_mce_t *mce, *tmce, *omce = NULL; 2812 boolean_t do_attach = B_TRUE; 2813 2814 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2815 jstate, mgid.gid_prefix, mgid.gid_guid); 2816 2817 /* 2818 * For enable_multicast Full member joins, we need to do some 2819 * extra work. If there is already an mce on the list that 2820 * indicates full membership, that means the membership has 2821 * not yet been dropped (since the disable_multicast was issued) 2822 * because there are pending Tx's to the mcg; in that case, just 2823 * mark the mce not to be reaped when the Tx completion queues 2824 * an async reap operation. 2825 * 2826 * If there is already an mce on the list indicating sendonly 2827 * membership, try to promote to full membership. Be careful 2828 * not to deallocate the old mce, since there might be an AH 2829 * pointing to it; instead, update the old mce with new data 2830 * that tracks the full membership. 2831 */ 2832 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2833 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2834 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2835 ASSERT(omce->mc_fullreap); 2836 omce->mc_fullreap = B_FALSE; 2837 return (omce); 2838 } else { 2839 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2840 } 2841 } 2842 2843 /* 2844 * Allocate the ibd_mce_t to track this JOIN. 2845 */ 2846 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2847 mce->mc_fullreap = B_FALSE; 2848 mce->mc_jstate = jstate; 2849 2850 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2851 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2852 ibt_status); 2853 kmem_free(mce, sizeof (ibd_mce_t)); 2854 return (NULL); 2855 } 2856 2857 /* 2858 * Is an IBA attach required? Not if the interface is already joined 2859 * to the mcg in a different appropriate join state. 2860 */ 2861 if (jstate == IB_MC_JSTATE_NON) { 2862 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2863 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2864 do_attach = B_FALSE; 2865 } else if (jstate == IB_MC_JSTATE_FULL) { 2866 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2867 do_attach = B_FALSE; 2868 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2869 do_attach = B_FALSE; 2870 } 2871 2872 if (do_attach) { 2873 /* 2874 * Do the IBA attach. 2875 */ 2876 DPRINT(10, "ibd_join_group : ibt_attach_mcg \n"); 2877 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2878 &mce->mc_info)) != IBT_SUCCESS) { 2879 DPRINT(10, "ibd_join_group : failed qp attachment " 2880 "%d\n", ibt_status); 2881 /* 2882 * NOTE that we should probably preserve the join info 2883 * in the list and later try to leave again at detach 2884 * time. 2885 */ 2886 (void) ibt_leave_mcg(state->id_sgid, mgid, 2887 state->id_sgid, jstate); 2888 kmem_free(mce, sizeof (ibd_mce_t)); 2889 return (NULL); 2890 } 2891 } 2892 2893 /* 2894 * Insert the ibd_mce_t in the proper list. 2895 */ 2896 if (jstate == IB_MC_JSTATE_NON) { 2897 IBD_MCACHE_INSERT_NON(state, mce); 2898 } else { 2899 /* 2900 * Set up the mc_req fields used for reaping the 2901 * mcg in case of delayed tx completion (see 2902 * ibd_tx_cleanup()). Also done for sendonly join in 2903 * case we are promoted to fullmembership later and 2904 * keep using the same mce. 
2905 */ 2906 mce->mc_req.rq_gid = mgid; 2907 mce->mc_req.rq_ptr = mce; 2908 /* 2909 * Check whether this is the case of trying to join 2910 * full member, and we were already joined send only. 2911 * We try to drop our SendOnly membership, but it is 2912 * possible that the mcg does not exist anymore (and 2913 * the subnet trap never reached us), so the leave 2914 * operation might fail. 2915 */ 2916 if (omce != NULL) { 2917 (void) ibt_leave_mcg(state->id_sgid, mgid, 2918 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2919 omce->mc_jstate = IB_MC_JSTATE_FULL; 2920 bcopy(&mce->mc_info, &omce->mc_info, 2921 sizeof (ibt_mcg_info_t)); 2922 kmem_free(mce, sizeof (ibd_mce_t)); 2923 return (omce); 2924 } 2925 mutex_enter(&state->id_mc_mutex); 2926 IBD_MCACHE_INSERT_FULL(state, mce); 2927 mutex_exit(&state->id_mc_mutex); 2928 } 2929 2930 return (mce); 2931 } 2932 2933 /* 2934 * Called during port up event handling to attempt to reacquire full 2935 * membership to an mcg. Stripped down version of ibd_join_group(). 2936 * Note that it is possible that the mcg might have gone away, and 2937 * gets recreated at this point. 2938 */ 2939 static void 2940 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2941 { 2942 ib_gid_t mgid; 2943 2944 /* 2945 * If the mc_fullreap flag is set, or this join fails, a subsequent 2946 * reap/leave is going to try to leave the group. We could prevent 2947 * that by adding a boolean flag into ibd_mce_t, if required. 2948 */ 2949 if (mce->mc_fullreap) 2950 return; 2951 2952 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2953 2954 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2955 mgid.gid_guid); 2956 2957 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2958 ibd_print_warn(state, "Failure on port up to rejoin " 2959 "multicast gid %016llx:%016llx", 2960 (u_longlong_t)mgid.gid_prefix, 2961 (u_longlong_t)mgid.gid_guid); 2962 } 2963 2964 /* 2965 * This code handles delayed Tx completion cleanups for mcg's to which 2966 * disable_multicast has been issued, regular mcg related cleanups during 2967 * disable_multicast, disable_promiscuous and mcg traps, as well as 2968 * cleanups during driver detach time. Depending on the join state, 2969 * it deletes the mce from the appropriate list and issues the IBA 2970 * leave/detach; except in the disable_multicast case when the mce 2971 * is left on the active list for a subsequent Tx completion cleanup. 2972 */ 2973 static void 2974 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2975 uint8_t jstate) 2976 { 2977 ibd_mce_t *tmce; 2978 boolean_t do_detach = B_TRUE; 2979 2980 /* 2981 * Before detaching, we must check whether the other list 2982 * contains the mcg; if we detach blindly, the consumer 2983 * who set up the other list will also stop receiving 2984 * traffic. 2985 */ 2986 if (jstate == IB_MC_JSTATE_FULL) { 2987 /* 2988 * The following check is only relevant while coming 2989 * from the Tx completion path in the reap case.
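 *
 * Illustration only, the detach decision below (the mirror image of
 * the attach decision in ibd_join_group()):
 *
 *	leaving FULL       detach, unless a NON membership remains
 *	leaving NON        detach, unless a FULL membership remains
 *	leaving SEND_ONLY  no detach; the QP was never attached for it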
2990 */ 2991 if (!mce->mc_fullreap) 2992 return; 2993 mutex_enter(&state->id_mc_mutex); 2994 IBD_MCACHE_PULLOUT_FULL(state, mce); 2995 mutex_exit(&state->id_mc_mutex); 2996 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2997 do_detach = B_FALSE; 2998 } else if (jstate == IB_MC_JSTATE_NON) { 2999 IBD_MCACHE_PULLOUT_NON(state, mce); 3000 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3001 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3002 do_detach = B_FALSE; 3003 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3004 mutex_enter(&state->id_mc_mutex); 3005 IBD_MCACHE_PULLOUT_FULL(state, mce); 3006 mutex_exit(&state->id_mc_mutex); 3007 do_detach = B_FALSE; 3008 } 3009 3010 /* 3011 * If we are reacting to a mcg trap and leaving our sendonly or 3012 * non membership, the mcg is possibly already gone, so attempting 3013 * to leave might fail. On the other hand, we must try to leave 3014 * anyway, since this might be a trap from long ago, and we could 3015 * have potentially sendonly joined to a recent incarnation of 3016 * the mcg and are about to lose track of this information. 3017 */ 3018 if (do_detach) { 3019 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3020 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3021 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3022 } 3023 3024 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3025 kmem_free(mce, sizeof (ibd_mce_t)); 3026 } 3027 3028 /* 3029 * Async code executed due to multicast and promiscuous disable requests 3030 * and mcg trap handling; also executed during driver detach. Mostly, a 3031 * leave and detach is done; except for the fullmember case when Tx 3032 * requests are pending, in which case arrangements are made for subsequent 3033 * cleanup on Tx completion. 3034 */ 3035 static void 3036 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3037 { 3038 ipoib_mac_t mcmac; 3039 boolean_t recycled; 3040 ibd_mce_t *mce; 3041 3042 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3043 jstate, mgid.gid_prefix, mgid.gid_guid); 3044 3045 if (jstate == IB_MC_JSTATE_NON) { 3046 recycled = B_TRUE; 3047 mce = IBD_MCACHE_FIND_NON(state, mgid); 3048 /* 3049 * In case we are handling a mcg trap, we might not find 3050 * the mcg in the non list. 3051 */ 3052 if (mce == NULL) 3053 return; 3054 } else { 3055 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3056 3057 /* 3058 * In case we are handling a mcg trap, make sure the trap 3059 * is not arriving late; if we have an mce that indicates 3060 * that we are already a fullmember, that would be a clear 3061 * indication that the trap arrived late (ie, is for a 3062 * previous incarnation of the mcg). 3063 */ 3064 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3065 if ((mce == NULL) || (mce->mc_jstate == 3066 IB_MC_JSTATE_FULL)) 3067 return; 3068 ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3069 } else { 3070 ASSERT(jstate == IB_MC_JSTATE_FULL); 3071 ASSERT((mce != NULL) && (mce->mc_jstate == 3072 IB_MC_JSTATE_FULL)); 3073 mce->mc_fullreap = B_TRUE; 3074 } 3075 3076 /* 3077 * If no pending Tx's remain that reference the AH 3078 * for the mcg, recycle it from active to free list. 3079 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3080 * so the last completing Tx will cause an async reap 3081 * operation to be invoked, at which time we will drop our 3082 * membership to the mcg so that the pending Tx's complete 3083 * successfully. Refer to comments on "AH and MCE active 3084 * list manipulation" at top of this file.
The lock protects 3085 * against Tx fast path and Tx cleanup code. 3086 */ 3087 mutex_enter(&state->id_ac_mutex); 3088 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3089 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3090 IB_MC_JSTATE_SEND_ONLY_NON)); 3091 mutex_exit(&state->id_ac_mutex); 3092 } 3093 3094 if (recycled) { 3095 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3096 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3097 ibd_async_reap_group(state, mce, mgid, jstate); 3098 } 3099 } 3100 3101 /* 3102 * Find the broadcast address as defined by IPoIB; implicitly 3103 * determines the IBA scope, mtu, tclass etc of the link the 3104 * interface is going to be a member of. 3105 */ 3106 static ibt_status_t 3107 ibd_find_bgroup(ibd_state_t *state) 3108 { 3109 ibt_mcg_attr_t mcg_attr; 3110 uint_t numg; 3111 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3112 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3113 IB_MC_SCOPE_GLOBAL }; 3114 int i, mcgmtu; 3115 boolean_t found = B_FALSE; 3116 3117 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3118 mcg_attr.mc_pkey = state->id_pkey; 3119 state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK; 3120 3121 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3122 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3123 3124 /* 3125 * Look for the IPoIB broadcast group. 3126 */ 3127 state->id_mgid.gid_prefix = 3128 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3129 ((uint64_t)state->id_scope << 48) | 3130 ((uint32_t)(state->id_pkey << 16))); 3131 mcg_attr.mc_mgid = state->id_mgid; 3132 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3133 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3134 found = B_TRUE; 3135 break; 3136 } 3137 3138 } 3139 3140 if (!found) { 3141 ibd_print_warn(state, "IPoIB broadcast group absent"); 3142 return (IBT_FAILURE); 3143 } 3144 3145 /* 3146 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3147 */ 3148 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3149 if (state->id_mtu < mcgmtu) { 3150 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3151 "greater than port's maximum MTU %d", mcgmtu, 3152 state->id_mtu); 3153 return (IBT_FAILURE); 3154 } 3155 state->id_mtu = mcgmtu; 3156 3157 return (IBT_SUCCESS); 3158 } 3159 3160 /* 3161 * Post ibt_attach() initialization. 3162 */ 3163 static int 3164 ibd_drv_init(ibd_state_t *state) 3165 { 3166 kthread_t *kht; 3167 ibt_ud_chan_alloc_args_t ud_alloc_attr; 3168 ibt_ud_chan_query_attr_t ud_chan_attr; 3169 ibt_hca_portinfo_t *port_infop; 3170 ibt_hca_attr_t hca_attrs; 3171 ibt_status_t ibt_status; 3172 ibt_cq_attr_t cq_attr; 3173 ib_guid_t hca_guid; 3174 uint32_t real_size; 3175 uint32_t *ptr; 3176 char pathname[OBP_MAXPATHLEN]; 3177 uint_t psize, port_infosz; 3178 3179 /* 3180 * Initialize id_port before ibt_open_hca because of 3181 * ordering requirements in port up/down handling. 
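 *
 * Illustration only: ibd_async_handler() filters port events with
 *
 *	if (state->id_port != event->ev_port)
 *		break;
 *
 * and such events can arrive any time after ibt_open_hca() returns,
 * which is why ibd_get_portpkey() must fill in id_port first.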
3182 */ 3183 if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) 3184 return (DDI_FAILURE); 3185 3186 if (ibt_open_hca(state->id_ibt_hdl, hca_guid, 3187 &state->id_hca_hdl) != IBT_SUCCESS) { 3188 DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); 3189 return (DDI_FAILURE); 3190 } 3191 3192 mutex_enter(&state->id_link_mutex); 3193 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 3194 state->id_port, &port_infop, &psize, 3195 &port_infosz); 3196 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 3197 mutex_exit(&state->id_link_mutex); 3198 DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); 3199 (void) ibt_close_hca(state->id_hca_hdl); 3200 return (DDI_FAILURE); 3201 } 3202 3203 /* 3204 * If the link already went down by the time we get here, give up; 3205 * we can not even get the gid since that is not valid. We would 3206 * fail in ibd_find_bgroup() anyway. 3207 */ 3208 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 3209 mutex_exit(&state->id_link_mutex); 3210 ibt_free_portinfo(port_infop, port_infosz); 3211 (void) ibt_close_hca(state->id_hca_hdl); 3212 ibd_print_warn(state, "Port is not active"); 3213 return (DDI_FAILURE); 3214 } 3215 3216 /* 3217 * This verifies the Pkey ibnexus handed us is still valid. 3218 * This is also the point from which the pkey table for the 3219 * port must hold the exact pkey value at the exact index 3220 * across port up/downs. 3221 */ 3222 if (ibt_pkey2index(state->id_hca_hdl, state->id_port, 3223 state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { 3224 mutex_exit(&state->id_link_mutex); 3225 ibt_free_portinfo(port_infop, port_infosz); 3226 DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); 3227 (void) ibt_close_hca(state->id_hca_hdl); 3228 return (DDI_FAILURE); 3229 } 3230 3231 state->id_mtu = (128 << port_infop->p_mtu); 3232 state->id_sgid = *port_infop->p_sgid_tbl; 3233 state->id_link_state = GLD_LINKSTATE_UP; 3234 mutex_exit(&state->id_link_mutex); 3235 3236 ibt_free_portinfo(port_infop, port_infosz); 3237 3238 state->id_link_speed = ibd_get_portspeed(state); 3239 3240 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 3241 ASSERT(ibt_status == IBT_SUCCESS); 3242 3243 /* 3244 * We need to determine whether the HCA can support checksum 3245 * and indicate that to higher layers. 3246 */ 3247 if (ibd_csum_send > IBD_CSUM_NONE) 3248 state->id_macinfo->gldm_capabilities |= GLD_CAP_CKSUM_PARTIAL; 3249 3250 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 3251 DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); 3252 goto drv_init_fail_find_bgroup; 3253 } 3254 state->id_macinfo->gldm_maxpkt = state->id_mtu - IPOIB_HDRSIZE; 3255 3256 if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 3257 &state->id_pd_hdl) != IBT_SUCCESS) { 3258 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); 3259 goto drv_init_fail_alloc_pd; 3260 } 3261 3262 /* Initialize the parallel ARP cache and AHs */ 3263 if (ibd_acache_init(state) != DDI_SUCCESS) { 3264 DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); 3265 goto drv_init_fail_acache; 3266 } 3267 3268 /* 3269 * Check various tunable limits. 3270 */ 3271 if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) { 3272 ibd_print_warn(state, "Setting #sgl = %d instead of default %d", 3273 hca_attrs.hca_max_sgl, IBD_MAX_SQSEG); 3274 state->id_max_sqseg = hca_attrs.hca_max_sgl; 3275 } else { 3276 state->id_max_sqseg = IBD_MAX_SQSEG; 3277 } 3278 3279 /* 3280 * First, check #r/s wqes against max channel size. 
3281 */ 3282 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) 3283 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 3284 else 3285 state->id_num_rwqe = IBD_NUM_RWQE; 3286 3287 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) 3288 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 3289 else 3290 state->id_num_swqe = IBD_NUM_SWQE; 3291 3292 /* 3293 * Allocate Rx/combined CQ: 3294 * Theoretically, there is no point in having more than #rwqe 3295 * plus #swqe cqe's, except that the CQ will be signalled for 3296 * overflow when the last wqe completes, if none of the previous 3297 * cqe's have been polled. Thus, we allocate just a few less wqe's 3298 * to make sure such overflow does not occur. 3299 */ 3300 cq_attr.cq_sched = NULL; 3301 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 3302 3303 if (ibd_separate_cqs == 1) { 3304 /* 3305 * Allocate Receive CQ. 3306 */ 3307 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 3308 cq_attr.cq_size = state->id_num_rwqe + 1; 3309 } else { 3310 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3311 state->id_num_rwqe = cq_attr.cq_size - 1; 3312 } 3313 3314 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3315 ibd_print_warn(state, "Computed #rwqe %d based on " 3316 "requested size and supportable CQ size is less " 3317 "than the required threshold %d", 3318 state->id_num_rwqe, IBD_RX_THRESHOLD); 3319 goto drv_init_fail_min_rwqes; 3320 } 3321 3322 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3323 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3324 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3325 goto drv_init_fail_alloc_rcq; 3326 } 3327 3328 /* 3329 * Allocate Send CQ. 3330 */ 3331 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 3332 cq_attr.cq_size = state->id_num_swqe + 1; 3333 } else { 3334 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3335 state->id_num_swqe = cq_attr.cq_size - 1; 3336 } 3337 3338 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3339 &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { 3340 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3341 goto drv_init_fail_alloc_scq; 3342 } 3343 } else { 3344 /* 3345 * Allocate combined Send/Receive CQ. 3346 */ 3347 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 3348 state->id_num_swqe + 1)) { 3349 cq_attr.cq_size = state->id_num_rwqe + 3350 state->id_num_swqe + 1; 3351 } else { 3352 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3353 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 3354 state->id_num_rwqe) / (state->id_num_rwqe + 3355 state->id_num_swqe); 3356 state->id_num_swqe = cq_attr.cq_size - 1 - 3357 state->id_num_rwqe; 3358 } 3359 3360 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3361 ibd_print_warn(state, "Computed #rwqe %d based on " 3362 "requested size and supportable CQ size is less " 3363 "than the required threshold %d", 3364 state->id_num_rwqe, IBD_RX_THRESHOLD); 3365 goto drv_init_fail_min_rwqes; 3366 } 3367 3368 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3369 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3370 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3371 goto drv_init_fail_alloc_rcq; 3372 } 3373 state->id_scq_hdl = state->id_rcq_hdl; 3374 } 3375 3376 /* 3377 * Print message in case we could not allocate as many wqe's 3378 * as was requested. Note that in the combined CQ case, we will 3379 * get the following message. 
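 *
 * Worked example of the combined CQ split above (illustration only):
 * with the default 4095/4095 request and an HCA limit of 4096 CQEs,
 *
 *	id_num_rwqe = (4095 * 4095) / 8190 = 2047
 *	id_num_swqe = 4096 - 1 - 2047     = 2048
 *
 * which is what the warnings below would then report.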
3380 */ 3381 if (state->id_num_rwqe != IBD_NUM_RWQE) 3382 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 3383 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 3384 if (state->id_num_swqe != IBD_NUM_SWQE) 3385 ibd_print_warn(state, "Setting #swqe = %d instead of default " 3386 "%d", state->id_num_swqe, IBD_NUM_SWQE); 3387 3388 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 3389 ud_alloc_attr.ud_hca_port_num = state->id_port; 3390 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 3391 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 3392 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 3393 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 3394 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 3395 ud_alloc_attr.ud_scq = state->id_scq_hdl; 3396 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 3397 ud_alloc_attr.ud_pd = state->id_pd_hdl; 3398 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 3399 ud_alloc_attr.ud_clone_chan = NULL; 3400 if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 3401 &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { 3402 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" 3403 "\n"); 3404 goto drv_init_fail_alloc_chan; 3405 } 3406 3407 if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != 3408 DDI_SUCCESS) { 3409 DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); 3410 goto drv_init_fail_query_chan; 3411 } 3412 state->id_qpnum = ud_chan_attr.ud_qpn; 3413 3414 /* Initialize the Transmit buffer list */ 3415 if (ibd_init_txlist(state) != DDI_SUCCESS) { 3416 DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); 3417 goto drv_init_fail_txlist_init; 3418 } 3419 3420 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 3421 /* Setup the handler we will use for regular DLPI stuff */ 3422 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 3423 if (ibt_enable_cq_notify(state->id_scq_hdl, 3424 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 3425 DPRINT(10, "ibd_drv_init : failed in" 3426 " ibt_enable_cq_notify()\n"); 3427 goto drv_init_fail_cq_notify; 3428 } 3429 } 3430 3431 /* Create the service fifos before we start receiving */ 3432 if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos, 3433 state)) == NULL) { 3434 DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n"); 3435 goto drv_init_fail_srv_fifo; 3436 } 3437 3438 /* Initialize the Receive buffer list */ 3439 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 3440 DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); 3441 goto drv_init_fail_rxlist_init; 3442 } 3443 3444 /* Join to IPoIB broadcast group as required by IPoIB */ 3445 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 3446 DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); 3447 goto drv_init_fail_join_group; 3448 } 3449 3450 /* Create the async thread */ 3451 if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 3452 TS_RUN, minclsyspri)) == NULL) { 3453 /* Do we have to specially leave the group? */ 3454 DPRINT(10, "ibd_drv_init : failed in thread_create\n"); 3455 goto drv_init_fail_thread_create; 3456 } 3457 state->id_async_thrid = kht->t_did; 3458 3459 /* 3460 * The local mac address is now known. Create the IPoIB 3461 * address. 3462 */ 3463 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 3464 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3465 state->id_macinfo->gldm_vendor_addr = (uchar_t *)&state->id_macaddr; 3466 3467 /* 3468 * Similarly, program in the broadcast mac address. 
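 *
 * Illustration only, the 20 byte IPoIB address layout produced by
 * ibd_h2n_mac() for both addresses:
 *
 *	bytes 0..3    the QPN field (IB_QPN_MASK for the broadcast address)
 *	bytes 4..11   GID prefix (id_sgid or id_mgid prefix)
 *	bytes 12..19  GID guid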
3469 */ 3470 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, 3471 state->id_mgid.gid_guid); 3472 state->id_macinfo->gldm_broadcast_addr = (uchar_t *)&state->id_bcaddr; 3473 3474 ptr = (uint32_t *)&state->id_macaddr; 3475 DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", 3476 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3477 ptr = (uint32_t *)&state->id_bcaddr; 3478 DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", 3479 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3480 DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", 3481 state->id_pkey, state->id_mgid.gid_prefix, 3482 state->id_mgid.gid_guid); 3483 DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", 3484 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3485 DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); 3486 DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); 3487 (void) ddi_pathname(state->id_dip, pathname); 3488 DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); 3489 3490 return (DDI_SUCCESS); 3491 3492 drv_init_fail_thread_create: 3493 ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL); 3494 3495 drv_init_fail_join_group: 3496 ibd_fini_rxlist(state); 3497 3498 drv_init_fail_rxlist_init: 3499 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3500 3501 drv_init_fail_srv_fifo: 3502 drv_init_fail_cq_notify: 3503 ibd_fini_txlist(state); 3504 3505 drv_init_fail_txlist_init: 3506 drv_init_fail_query_chan: 3507 if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) 3508 DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); 3509 3510 drv_init_fail_alloc_chan: 3511 if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != 3512 IBT_SUCCESS)) 3513 DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); 3514 3515 drv_init_fail_alloc_scq: 3516 if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) 3517 DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); 3518 3519 drv_init_fail_min_rwqes: 3520 drv_init_fail_alloc_rcq: 3521 ibd_acache_fini(state); 3522 drv_init_fail_acache: 3523 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3524 DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); 3525 3526 drv_init_fail_alloc_pd: 3527 ibt_free_mcg_info(state->id_mcinfo, 1); 3528 drv_init_fail_find_bgroup: 3529 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3530 DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); 3531 3532 return (DDI_FAILURE); 3533 } 3534 3535 /* 3536 * Allocate the statically allocated Tx buffer list. 3537 */ 3538 static int 3539 ibd_init_txlist(ibd_state_t *state) 3540 { 3541 ibd_swqe_t *swqe; 3542 int i; 3543 3544 for (i = 0; i < state->id_num_swqe; i++) { 3545 if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) { 3546 DPRINT(10, "ibd_init_txlist : failed in " 3547 "ibd_alloc_swqe()\n"); 3548 ibd_fini_txlist(state); 3549 return (DDI_FAILURE); 3550 } 3551 3552 /* add to list */ 3553 state->id_tx_list.dl_cnt++; 3554 if (state->id_tx_list.dl_head == NULL) { 3555 swqe->swqe_prev = NULL; 3556 swqe->swqe_next = NULL; 3557 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3558 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3559 } else { 3560 swqe->swqe_prev = state->id_tx_list.dl_tail; 3561 swqe->swqe_next = NULL; 3562 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3563 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3564 } 3565 } 3566 3567 return (DDI_SUCCESS); 3568 } 3569 3570 /* 3571 * Free the statically allocated Tx buffer list. 
3572 */ 3573 static void 3574 ibd_fini_txlist(ibd_state_t *state) 3575 { 3576 ibd_swqe_t *node; 3577 3578 mutex_enter(&state->id_tx_list.dl_mutex); 3579 while (state->id_tx_list.dl_head != NULL) { 3580 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3581 state->id_tx_list.dl_head = node->swqe_next; 3582 state->id_tx_list.dl_cnt--; 3583 ASSERT(state->id_tx_list.dl_cnt >= 0); 3584 ibd_free_swqe(state, node); 3585 } 3586 mutex_exit(&state->id_tx_list.dl_mutex); 3587 } 3588 3589 /* 3590 * Allocate a single send wqe and register it so it is almost 3591 * ready to be posted to the hardware. 3592 */ 3593 static int 3594 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe) 3595 { 3596 ibt_mr_attr_t mem_attr; 3597 ibd_swqe_t *swqe; 3598 3599 swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP); 3600 *wqe = swqe; 3601 swqe->swqe_type = IBD_WQE_SEND; 3602 swqe->swqe_next = NULL; 3603 swqe->swqe_prev = NULL; 3604 swqe->swqe_im_mblk = NULL; 3605 swqe->w_mdtinfo = NULL; 3606 3607 /* alloc copy buffer, must be max size to handle multiple mblk case */ 3608 swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP); 3609 3610 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3611 mem_attr.mr_len = state->id_mtu; 3612 mem_attr.mr_as = NULL; 3613 mem_attr.mr_flags = IBT_MR_SLEEP; 3614 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3615 &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) != 3616 IBT_SUCCESS) { 3617 DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()"); 3618 kmem_free(swqe->swqe_copybuf.ic_bufaddr, 3619 state->id_mtu); 3620 kmem_free(swqe, sizeof (ibd_swqe_t)); 3621 return (DDI_FAILURE); 3622 } 3623 3624 swqe->swqe_copybuf.ic_sgl.ds_va = 3625 (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3626 swqe->swqe_copybuf.ic_sgl.ds_key = 3627 swqe->swqe_copybuf.ic_mr_desc.md_lkey; 3628 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3629 3630 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3631 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3632 swqe->w_swr.wr_trans = IBT_UD_SRV; 3633 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3634 3635 /* These are set in send */ 3636 swqe->w_swr.wr_nds = 0; 3637 swqe->w_swr.wr_sgl = NULL; 3638 3639 return (DDI_SUCCESS); 3640 } 3641 3642 /* 3643 * Free an allocated send wqe. 3644 */ 3645 static void 3646 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3647 { 3648 3649 if (ibt_deregister_mr(state->id_hca_hdl, 3650 swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3651 DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()"); 3652 return; 3653 } 3654 kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu); 3655 kmem_free(swqe, sizeof (ibd_swqe_t)); 3656 } 3657 3658 /* 3659 * Post a rwqe to the hardware and add it to the Rx list. The 3660 * "recycle" parameter indicates whether an old rwqe is being 3661 * recycled, or this is a new one. 3662 */ 3663 static int 3664 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3665 { 3666 if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) != 3667 IBT_SUCCESS) { 3668 DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()"); 3669 return (DDI_FAILURE); 3670 } 3671 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3672 3673 /* 3674 * Buffers being recycled are already in the list. 
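 *
 * Usage sketch (illustrative): the initial Rx fill and the replenish
 * path post with recycle == B_FALSE so the wqe also gets linked below,
 * while the STREAMS free callback re-posts an already listed buffer
 * with recycle == B_TRUE:
 *
 *	if (ibd_post_rwqe(state, new_rwqe, B_FALSE) == DDI_FAILURE)
 *		ibd_free_rwqe(state, new_rwqe);
 *
 *	if (ibd_post_rwqe(state, recycled_rwqe, B_TRUE) == DDI_FAILURE)
 *		... retire the buffer instead ...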
3675 */ 3676 if (recycle) 3677 return (DDI_SUCCESS); 3678 3679 mutex_enter(&state->id_rx_list.dl_mutex); 3680 if (state->id_rx_list.dl_head == NULL) { 3681 rwqe->rwqe_prev = NULL; 3682 rwqe->rwqe_next = NULL; 3683 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3684 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3685 } else { 3686 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3687 rwqe->rwqe_next = NULL; 3688 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3689 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3690 } 3691 mutex_exit(&state->id_rx_list.dl_mutex); 3692 3693 return (DDI_SUCCESS); 3694 } 3695 3696 /* 3697 * Allocate the statically allocated Rx buffer list. 3698 */ 3699 static int 3700 ibd_init_rxlist(ibd_state_t *state) 3701 { 3702 ibd_rwqe_t *rwqe; 3703 int i; 3704 3705 for (i = 0; i < state->id_num_rwqe; i++) { 3706 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3707 ibd_fini_rxlist(state); 3708 return (DDI_FAILURE); 3709 } 3710 3711 if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { 3712 ibd_free_rwqe(state, rwqe); 3713 ibd_fini_rxlist(state); 3714 return (DDI_FAILURE); 3715 } 3716 } 3717 3718 return (DDI_SUCCESS); 3719 } 3720 3721 /* 3722 * Free the statically allocated Rx buffer list. 3723 * 3724 */ 3725 static void 3726 ibd_fini_rxlist(ibd_state_t *state) 3727 { 3728 ibd_rwqe_t *node; 3729 3730 mutex_enter(&state->id_rx_list.dl_mutex); 3731 while (state->id_rx_list.dl_head != NULL) { 3732 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3733 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3734 state->id_rx_list.dl_cnt--; 3735 ASSERT(state->id_rx_list.dl_cnt >= 0); 3736 3737 ibd_free_rwqe(state, node); 3738 } 3739 mutex_exit(&state->id_rx_list.dl_mutex); 3740 } 3741 3742 /* 3743 * Allocate a single recv wqe and register it so it is almost 3744 * ready to be posted to the hardware. 
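 *
 * The buffer loaning below uses the stock STREAMS desballoc(9F) pattern:
 * the driver wraps its own receive buffer in an mblk and names a free
 * routine that fires when the upper layer frees the message. A generic
 * sketch of that pattern (hypothetical names; in this driver the frtn_t
 * lives inside the rwqe so it stays valid for the life of the mblk):
 *
 *	frtn_t frtn = { my_free_func, (char *)my_arg };
 *	mblk_t *mp = desballoc(bufaddr, buflen, 0, &frtn);
 *	if (mp == NULL)
 *		... fall back or fail the allocation ...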
3745 */ 3746 static int 3747 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3748 { 3749 ibt_mr_attr_t mem_attr; 3750 ibd_rwqe_t *rwqe; 3751 3752 if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3753 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3754 return (DDI_FAILURE); 3755 } 3756 *wqe = rwqe; 3757 rwqe->rwqe_type = IBD_WQE_RECV; 3758 rwqe->w_state = state; 3759 rwqe->rwqe_next = NULL; 3760 rwqe->rwqe_prev = NULL; 3761 rwqe->w_freeing_wqe = B_FALSE; 3762 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3763 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3764 3765 if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3766 IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) { 3767 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2"); 3768 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3769 return (DDI_FAILURE); 3770 } 3771 3772 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3773 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3774 NULL) { 3775 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3776 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3777 state->id_mtu + IPOIB_GRH_SIZE); 3778 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3779 return (DDI_FAILURE); 3780 } 3781 3782 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3783 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3784 mem_attr.mr_as = NULL; 3785 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3786 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3787 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3788 IBT_SUCCESS) { 3789 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3790 rwqe->w_freeing_wqe = B_TRUE; 3791 freemsg(rwqe->rwqe_im_mblk); 3792 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3793 state->id_mtu + IPOIB_GRH_SIZE); 3794 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3795 return (DDI_FAILURE); 3796 } 3797 3798 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3799 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3800 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3801 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3802 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3803 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3804 rwqe->w_rwr.wr_nds = 1; 3805 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3806 3807 return (DDI_SUCCESS); 3808 } 3809 3810 /* 3811 * Free an allocated recv wqe. 3812 */ 3813 static void 3814 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3815 { 3816 3817 if (ibt_deregister_mr(state->id_hca_hdl, 3818 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3819 DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()"); 3820 return; 3821 } 3822 3823 /* 3824 * Indicate to the callback function that this rwqe/mblk 3825 * should not be recycled. The freemsg() will invoke 3826 * ibd_freemsg_cb(). 3827 */ 3828 if (rwqe->rwqe_im_mblk != NULL) { 3829 rwqe->w_freeing_wqe = B_TRUE; 3830 freemsg(rwqe->rwqe_im_mblk); 3831 } 3832 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3833 state->id_mtu + IPOIB_GRH_SIZE); 3834 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3835 } 3836 3837 /* 3838 * Delete the rwqe being freed from the rx list. 
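 *
 * Usage note (illustrative): a caller retiring a buffer for good unlinks
 * it first and then releases its resources, as the free callback further
 * below does:
 *
 *	ibd_delete_rwqe(state, rwqe);
 *	ibd_free_rwqe(state, rwqe);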
3839 */ 3840 static void 3841 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3842 { 3843 mutex_enter(&state->id_rx_list.dl_mutex); 3844 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3845 state->id_rx_list.dl_head = rwqe->rwqe_next; 3846 else 3847 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3848 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3849 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3850 else 3851 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3852 mutex_exit(&state->id_rx_list.dl_mutex); 3853 } 3854 3855 /* 3856 * Pre ibt_detach() deconstruction. 3857 */ 3858 static void 3859 ibd_drv_fini(ibd_state_t *state) 3860 { 3861 ib_gid_t mgid; 3862 ibd_mce_t *mce; 3863 ibt_status_t status; 3864 uint8_t jstate; 3865 3866 /* 3867 * Unsubscribe from trap notices; we will be tearing down 3868 * the mcg lists soon. Make sure the trap handler does nothing 3869 * even if it is invoked (i.e., until we invoke ibt_detach()). 3870 */ 3871 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 3872 mutex_enter(&state->id_trap_lock); 3873 state->id_trap_stop = B_TRUE; 3874 while (state->id_trap_inprog > 0) 3875 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 3876 mutex_exit(&state->id_trap_lock); 3877 3878 /* 3879 * Flushing the channel ensures that all pending WQE's 3880 * are marked with flush_error and handed to the CQ. It 3881 * does not guarantee the invocation of the CQ handler. 3882 * This call is guaranteed to return successfully for UD QPNs. 3883 */ 3884 status = ibt_flush_channel(state->id_chnl_hdl); 3885 ASSERT(status == IBT_SUCCESS); 3886 3887 /* 3888 * We possibly need a loop here to wait for all the Tx 3889 * callbacks to happen. The Tx handlers will retrieve 3890 * held resources like AH ac_ref count, registered memory 3891 * and possibly ASYNC_REAP requests. Rx interrupts were already 3892 * turned off (in ibd_detach()); turn off Tx interrupts and 3893 * poll. By the time the polling returns an empty indicator, 3894 * we are sure we have seen all pending Tx callbacks. Note 3895 * that after the ibt_set_cq_handler() returns, the old handler 3896 * is guaranteed not to be invoked anymore. 3897 */ 3898 if (ibd_separate_cqs == 1) 3899 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 3900 ibd_poll_compq(state, state->id_scq_hdl); 3901 3902 /* 3903 * No more async requests will be posted since the device has been 3904 * unregistered; completion handlers have been turned off, so Tx 3905 * handler will not cause any more ASYNC_REAP requests. Queue a 3906 * request for the async thread to exit, which will be serviced 3907 * after any pending ones. This can take a while, especially if the 3908 * SM is unreachable, since IBMF will slowly timeout each SM request 3909 * issued by the async thread. Reap the thread before continuing on; 3910 * we do not want it to be lingering in modunloaded code. 3911 */ 3912 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT); 3913 thread_join(state->id_async_thrid); 3914 3915 /* 3916 * We can not be in promiscuous mode anymore; upper layers 3917 * would have made a request to disable it (if ever set previously) 3918 * before the detach is allowed to progress to this point; and the 3919 * async thread would have processed that request by now. Thus the 3920 * nonmember list is guaranteed empty at this point. 3921 */ 3922 ASSERT(state->id_prom_op != COMPLETED); 3923 3924 /* 3925 * Drop all residual full/non membership. This includes full 3926 * membership to the broadcast group, and any nonmembership 3927 * acquired during transmits.
We do this after the Tx completion 3928 * handlers are done, since those might result in some late 3929 * leaves; this also eliminates a potential race with that 3930 * path wrt the mc full list insert/delete. Trap handling 3931 * has also been suppressed at this point. Thus, no locks 3932 * are required while traversing the mc full list. 3933 */ 3934 DPRINT(2, "ibd_drv_fini : clear full cache entries"); 3935 mce = list_head(&state->id_mc_full); 3936 while (mce != NULL) { 3937 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3938 jstate = mce->mc_jstate; 3939 mce = list_next(&state->id_mc_full, mce); 3940 ibd_leave_group(state, mgid, jstate); 3941 } 3942 3943 ibt_free_mcg_info(state->id_mcinfo, 1); 3944 3945 /* 3946 * Kill the channel now; guaranteed to return successfully 3947 * for UD QPNs. 3948 */ 3949 status = ibt_free_channel(state->id_chnl_hdl); 3950 ASSERT(status == IBT_SUCCESS); 3951 3952 /* 3953 * Kill the CQ; all completion handlers are guaranteed to 3954 * have terminated by the time this returns. Since we killed 3955 * the QPN above, we can not receive the IBT_CQ_BUSY error. 3956 */ 3957 status = ibt_free_cq(state->id_rcq_hdl); 3958 ASSERT(status == IBT_SUCCESS); 3959 3960 if (ibd_separate_cqs == 1) { 3961 status = ibt_free_cq(state->id_scq_hdl); 3962 ASSERT(status == IBT_SUCCESS); 3963 } 3964 3965 /* 3966 * We killed the receive interrupts, thus, we will not be 3967 * required to handle received packets anymore. Thus, kill 3968 * service threads since they are not going to be used anymore. 3969 */ 3970 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3971 3972 /* 3973 * Since these following will act on the Rx/Tx list, which 3974 * is also looked at by the Rx/Tx handlers, keep them around 3975 * till all handlers are guaranteed to have completed. 3976 */ 3977 ibd_fini_rxlist(state); 3978 ibd_fini_txlist(state); 3979 3980 /* 3981 * Clean up the active AH hash list. 3982 */ 3983 mod_hash_destroy_hash(state->id_ah_active_hash); 3984 3985 /* 3986 * Free parallel ARP cache and AHs; we are sure all of these 3987 * resources have been released by the Tx completion handler. 3988 */ 3989 ibd_acache_fini(state); 3990 3991 /* 3992 * We freed the QPN, all the MRs and AHs. This step should not 3993 * fail; print a warning message if it does fail, due to a bug 3994 * in the driver. 3995 */ 3996 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3997 ibd_print_warn(state, "failed to free protection domain"); 3998 3999 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 4000 ibd_print_warn(state, "failed to close HCA device"); 4001 } 4002 4003 /* 4004 * IBA Rx/Tx completion queue handler. Guaranteed to be single 4005 * threaded and nonreentrant for this CQ. When using combined CQ, 4006 * this handles Tx and Rx completions. With separate CQs, this handles 4007 * only Rx completions. 4008 */ 4009 /* ARGSUSED */ 4010 static void 4011 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4012 { 4013 ibd_state_t *state = (ibd_state_t *)arg; 4014 4015 atomic_add_64(&state->id_num_intrs, 1); 4016 (void) gld_intr(state->id_macinfo); 4017 } 4018 4019 /* 4020 * Separate CQ handler for Tx completions, when the Tx CQ is in 4021 * interrupt driven mode. 4022 */ 4023 /* ARGSUSED */ 4024 static void 4025 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4026 { 4027 ibd_state_t *state = (ibd_state_t *)arg; 4028 4029 atomic_add_64(&state->id_num_intrs, 1); 4030 4031 /* 4032 * Poll for completed entries; the CQ will not interrupt any 4033 * more for completed packets. 
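 *
 * The code below is the usual "drain, re-arm, drain again" recipe for
 * edge style CQ notification; without the second drain, a completion
 * arriving between the first poll and ibt_enable_cq_notify() would
 * never raise an interrupt. In outline (illustrative):
 *
 *	ibd_poll_compq(state, cq);			reap backlog
 *	(void) ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);	re-arm
 *	ibd_poll_compq(state, cq);			close the race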
4034 */ 4035 ibd_poll_compq(state, state->id_scq_hdl); 4036 4037 /* 4038 * Now enable CQ notifications; all completions originating now 4039 * will cause new interrupts. 4040 */ 4041 if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != 4042 IBT_SUCCESS) { 4043 /* 4044 * We do not expect a failure here. 4045 */ 4046 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 4047 } 4048 4049 /* 4050 * Repoll to catch all packets that might have completed after 4051 * we finished the first poll loop and before interrupts got 4052 * armed. 4053 */ 4054 ibd_poll_compq(state, state->id_scq_hdl); 4055 } 4056 4057 /* 4058 * Multicast group create/delete trap handler. These will be delivered 4059 * on a kernel thread (handling can thus block) and can be invoked 4060 * concurrently. The handler can be invoked anytime after it is 4061 * registered and before ibt_detach(). 4062 */ 4063 /* ARGSUSED */ 4064 static void 4065 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4066 ibt_subnet_event_t *event) 4067 { 4068 ibd_state_t *state = (ibd_state_t *)arg; 4069 ibd_req_t *req; 4070 4071 /* 4072 * The trap handler will get invoked once for every event for 4073 * evert port. The input "gid" is the GID0 of the port the 4074 * trap came in on; we just need to act on traps that came 4075 * to our port, meaning the port on which the ipoib interface 4076 * resides. Since ipoib uses GID0 of the port, we just match 4077 * the gids to check whether we need to handle the trap. 4078 */ 4079 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4080 return; 4081 4082 DPRINT(10, "ibd_notices_handler : %d\n", code); 4083 4084 switch (code) { 4085 case IBT_SM_EVENT_UNAVAILABLE: 4086 /* 4087 * If we are in promiscuous mode or have 4088 * sendnonmembers, we need to print a warning 4089 * message right now. Else, just store the 4090 * information, print when we enter promiscuous 4091 * mode or attempt nonmember send. We might 4092 * also want to stop caching sendnonmember. 4093 */ 4094 ibd_print_warn(state, "IBA multicast support " 4095 "degraded due to unavailability of multicast " 4096 "traps"); 4097 break; 4098 case IBT_SM_EVENT_AVAILABLE: 4099 /* 4100 * If we printed a warning message above or 4101 * while trying to nonmember send or get into 4102 * promiscuous mode, print an okay message. 4103 */ 4104 ibd_print_warn(state, "IBA multicast support " 4105 "restored due to availability of multicast " 4106 "traps"); 4107 break; 4108 case IBT_SM_EVENT_MCG_CREATED: 4109 case IBT_SM_EVENT_MCG_DELETED: 4110 /* 4111 * Common processing of creation/deletion traps. 4112 * First check if the instance is being 4113 * [de]initialized; back off then, without doing 4114 * anything more, since we are not sure if the 4115 * async thread is around, or whether we might 4116 * be racing with the detach code in ibd_drv_fini() 4117 * that scans the mcg list. 4118 */ 4119 if (!ibd_async_safe(state)) 4120 return; 4121 4122 req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); 4123 req->rq_gid = event->sm_notice_gid; 4124 req->rq_ptr = (void *)code; 4125 ibd_queue_work_slot(state, req, ASYNC_TRAP); 4126 break; 4127 } 4128 } 4129 4130 static void 4131 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4132 { 4133 ib_gid_t mgid = req->rq_gid; 4134 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4135 4136 DPRINT(10, "ibd_async_trap : %d\n", code); 4137 4138 /* 4139 * Atomically search the nonmember and sendonlymember lists and 4140 * delete. 
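 *
 * In outline (illustrative summary of the code below):
 *
 *	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
 *	if (id_prom_op == COMPLETED, i.e. promiscuous mode is active) {
 *		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
 *		re-join as NonMember; warn only for a creation trap
 *	}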
4141 */ 4142 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4143 4144 if (state->id_prom_op == COMPLETED) { 4145 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4146 4147 /* 4148 * If in promiscuous mode, try to join/attach to the new 4149 * mcg. Given the unreliable out-of-order mode of trap 4150 * delivery, we can never be sure whether it is a problem 4151 * if the join fails. Thus, we warn the admin of a failure 4152 * if this was a creation trap. Note that the trap might 4153 * actually be reporting a long past event, and the mcg 4154 * might already have been deleted, thus we might be warning 4155 * in vain. 4156 */ 4157 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4158 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4159 ibd_print_warn(state, "IBA promiscuous mode missed " 4160 "new multicast gid %016llx:%016llx", 4161 (u_longlong_t)mgid.gid_prefix, 4162 (u_longlong_t)mgid.gid_guid); 4163 } 4164 4165 /* 4166 * Free the request slot allocated by the subnet event thread. 4167 */ 4168 kmem_free(req, sizeof (ibd_req_t)); 4169 4170 ibd_async_done(state); 4171 } 4172 4173 /* 4174 * GLD entry point to reset hardware. 4175 */ 4176 /* ARGSUSED */ 4177 static int 4178 ibd_reset(gld_mac_info_t *macinfo) 4179 { 4180 /* 4181 * This will be invoked from Style 1 open() and Style 2 4182 * attach() routines, ie just before the interface starts 4183 * getting used. 4184 */ 4185 return (GLD_SUCCESS); 4186 } 4187 4188 /* 4189 * GLD entry point to start hardware. 4190 */ 4191 /* ARGSUSED */ 4192 static int 4193 ibd_start(gld_mac_info_t *macinfo) 4194 { 4195 return (GLD_SUCCESS); 4196 } 4197 4198 /* 4199 * GLD entry point to stop hardware from receiving packets. 4200 */ 4201 /* ARGSUSED */ 4202 static int 4203 ibd_stop(gld_mac_info_t *macinfo) 4204 { 4205 #ifdef RUN_PERFORMANCE 4206 ibd_perf((ibd_state_t *)macinfo->gldm_private); 4207 #endif 4208 return (GLD_SUCCESS); 4209 } 4210 4211 /* 4212 * GLD entry point to modify device's mac address. We do not 4213 * allow address modifications. 4214 */ 4215 static int 4216 ibd_set_mac_addr(gld_mac_info_t *macinfo, unsigned char *macaddr) 4217 { 4218 ibd_state_t *state; 4219 4220 state = (ibd_state_t *)macinfo->gldm_private; 4221 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4222 return (GLD_SUCCESS); 4223 else 4224 return (GLD_FAILURE); 4225 } 4226 4227 /* 4228 * The blocking part of the IBA join/leave operations are done out 4229 * of here on the async thread. 4230 */ 4231 static void 4232 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4233 { 4234 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4235 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4236 4237 if (op == ASYNC_JOIN) { 4238 int ret = ERRORED; 4239 4240 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) != NULL) 4241 ret = COMPLETED; 4242 4243 state->id_multi_op = ret; 4244 } else { 4245 /* 4246 * Here, we must search for the proper mcg_info and 4247 * use that to leave the group. 4248 */ 4249 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4250 } 4251 } 4252 4253 /* 4254 * GLD entry point for multicast enable/disable requests. 4255 * Invoked by GLD only on the first multicast enable for a specific 4256 * address (GLD is free to retry ocassionally if we return RETRY), 4257 * and on last disable of the same address. Just queue the operation 4258 * to the async thread. 
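 *
 * A sketch of the resulting handshake on the enable path (illustrative;
 * the states are the driver's own id_multi_op values):
 *
 *	NOTSTARTED: queue ASYNC_JOIN, mark ONGOING, return GLD_RETRY
 *	ONGOING:    return GLD_RETRY until the async thread finishes
 *	COMPLETED:  return GLD_SUCCESS and reset the slot to NOTSTARTED
 *	ERRORED:    return GLD_FAILURE and reset the slot to NOTSTARTED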
4259 */ 4260 static int 4261 ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) 4262 { 4263 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4264 ipoib_mac_t *mcast; 4265 ib_gid_t mgid; 4266 ib_qpn_t mcqpn; 4267 int ret; 4268 4269 /* 4270 * The incoming multicast address might not be aligned properly 4271 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4272 * it to look like one though, to get the offsets of the mc gid, 4273 * since we know we are not going to dereference any values with 4274 * the ipoib_mac_t pointer. 4275 */ 4276 mcast = (ipoib_mac_t *)mcmac; 4277 4278 /* 4279 * Check validity of MCG address. We could additionally check 4280 * that a enable/disable is not being issued on the "broadcast" 4281 * mcg, but since this operation is only invokable by priviledged 4282 * programs anyway, we allow the flexibility to those dlpi apps. 4283 * Note that we do not validate the "scope" of the IBA mcg. 4284 */ 4285 bcopy(&mcast->ipoib_qpn, &mcqpn, sizeof (ib_qpn_t)); 4286 if (mcqpn != htonl(IB_MC_QPN)) 4287 return (GLD_FAILURE); 4288 4289 /* 4290 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4291 * nothing (ie we stay JOINed to the broadcast group done in 4292 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically 4293 * requires to be joined to broadcast groups at all times. 4294 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4295 * depends on this. 4296 */ 4297 if (bcmp(mcast, state->id_macinfo->gldm_broadcast_addr, 4298 IPOIB_ADDRL) == 0) 4299 return (GLD_SUCCESS); 4300 4301 ibd_n2h_gid(mcast, &mgid); 4302 4303 if (op == GLD_MULTI_ENABLE) { 4304 DPRINT(1, "ibd_set_multicast : %016llx:%016llx\n", 4305 mgid.gid_prefix, mgid.gid_guid); 4306 ret = GLD_RETRY; 4307 mutex_enter(&state->id_mc_mutex); 4308 if (state->id_multi_op == NOTSTARTED) { 4309 state->id_multi_req.rq_gid = mgid; 4310 ibd_queue_work_slot(state, &state->id_multi_req, 4311 ASYNC_JOIN); 4312 state->id_multi_op = ONGOING; 4313 bcopy(mcast, &state->id_multi_addr, IPOIB_ADDRL); 4314 } else if (bcmp(&state->id_multi_addr, mcast, 4315 IPOIB_ADDRL) == 0) { 4316 if (state->id_multi_op != ONGOING) { 4317 if (state->id_multi_op == COMPLETED) 4318 ret = GLD_SUCCESS; 4319 else if (state->id_multi_op == ERRORED) 4320 ret = GLD_FAILURE; 4321 if (state->id_multi_queued) { 4322 state->id_multi_queued = B_FALSE; 4323 ibd_queue_work_slot(state, 4324 &state->id_multi_req, ASYNC_POKE); 4325 } else { 4326 state->id_multi_op = NOTSTARTED; 4327 } 4328 } 4329 } else { 4330 /* 4331 * Hmmm, a set was tried on another mcg. We 4332 * need to make sure to gld_sched for this 4333 * stream to retry once the ongoing one terminates. 4334 * The gld_sched out of the async thread on completion 4335 * of the mcg join is not enough; because the queued 4336 * stream might come in and get a RETRY again because 4337 * the mcg join result has still not been reaped by 4338 * the originator. If gld_sched ensured that streams 4339 * get tried in the order they received RETRYs, things 4340 * would be simpler. 4341 */ 4342 state->id_multi_queued = B_TRUE; 4343 } 4344 mutex_exit(&state->id_mc_mutex); 4345 } else { 4346 ibd_mce_t *mce; 4347 DPRINT(1, "ibd_set_multicast : unset_multicast : " 4348 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4349 ret = GLD_SUCCESS; 4350 mutex_enter(&state->id_mc_mutex); 4351 mce = IBD_MCACHE_FIND_FULL(state, mgid); 4352 mutex_exit(&state->id_mc_mutex); 4353 /* 4354 * GLD should not have invoked us unless the mcg was 4355 * added in the past. 
4356 */ 4357 ASSERT(mce != NULL); 4358 ASSERT(bcmp(&mce->mc_req.rq_gid, &mgid, sizeof (mgid)) == 0); 4359 ibd_queue_work_slot(state, &mce->mc_req, ASYNC_LEAVE); 4360 } 4361 return (ret); 4362 } 4363 4364 /* 4365 * The blocking part of the IBA promiscuous operations are done 4366 * out of here on the async thread. The dlpireq parameter indicates 4367 * whether this invocation is due to a dlpi request or due to 4368 * a port up/down event. 4369 */ 4370 static void 4371 ibd_async_unsetprom(ibd_state_t *state, boolean_t dlpireq) 4372 { 4373 ibd_mce_t *mce = list_head(&state->id_mc_non); 4374 ib_gid_t mgid; 4375 4376 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4377 4378 /* 4379 * Mark the request slot as empty and reusable for the 4380 * next promiscuous set request. 4381 */ 4382 if (dlpireq) 4383 state->id_prom_op = NOTSTARTED; 4384 4385 while (mce != NULL) { 4386 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4387 mce = list_next(&state->id_mc_non, mce); 4388 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4389 } 4390 } 4391 4392 /* 4393 * The blocking part of the IBA promiscuous operations are done 4394 * out of here on the async thread. The dlpireq parameter indicates 4395 * whether this invocation is due to a dlpi request or due to 4396 * a port up/down event. 4397 */ 4398 static void 4399 ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq) 4400 { 4401 ibt_mcg_attr_t mcg_attr; 4402 ibt_mcg_info_t *mcg_info; 4403 ib_gid_t mgid; 4404 uint_t numg; 4405 int i; 4406 4407 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4408 4409 /* 4410 * Obtain all active MC groups on the IB fabric with 4411 * specified criteria (scope + Pkey + Qkey + mtu). 4412 */ 4413 bzero(&mcg_attr, sizeof (mcg_attr)); 4414 mcg_attr.mc_pkey = state->id_pkey; 4415 mcg_attr.mc_scope = state->id_scope; 4416 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4417 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4418 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4419 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4420 IBT_SUCCESS) { 4421 ibd_print_warn(state, "Could not get list of IBA multicast " 4422 "groups"); 4423 if (dlpireq) 4424 state->id_prom_op = ERRORED; 4425 return; 4426 } 4427 4428 /* 4429 * Iterate over the returned mcg's and join as NonMember 4430 * to the IP mcg's. 4431 */ 4432 for (i = 0; i < numg; i++) { 4433 /* 4434 * Do a NonMember JOIN on the MC group. 4435 */ 4436 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4437 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4438 ibd_print_warn(state, "IBA promiscuous mode missed " 4439 "multicast gid %016llx:%016llx", 4440 (u_longlong_t)mgid.gid_prefix, 4441 (u_longlong_t)mgid.gid_guid); 4442 } 4443 4444 ibt_free_mcg_info(mcg_info, numg); 4445 if (dlpireq) 4446 state->id_prom_op = COMPLETED; 4447 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4448 } 4449 4450 /* 4451 * GLD entry point for multicast promiscuous enable/disable requests. 4452 * GLD assumes phys state receives more packets than multi state, 4453 * which is not true for IPoIB. Thus, treat the multi and phys 4454 * promiscuous states the same way to work with GLD's assumption. 
4455 */ 4456 static int 4457 ibd_set_promiscuous(gld_mac_info_t *macinfo, int mode) 4458 { 4459 ibd_state_t *state; 4460 int ret; 4461 4462 state = (ibd_state_t *)macinfo->gldm_private; 4463 switch (mode) { 4464 case GLD_MAC_PROMISC_PHYS: 4465 case GLD_MAC_PROMISC_MULTI: 4466 DPRINT(1, "ibd_set_promiscuous : set_promisc : %d", 4467 mode); 4468 /* 4469 * Look at gld: this might be getting 4470 * called because someone is turning off 4471 * prom_phys. Nothing needs to be done in 4472 * that case. 4473 */ 4474 ret = GLD_RETRY; 4475 mutex_enter(&state->id_mc_mutex); 4476 switch (state->id_prom_op) { 4477 case NOTSTARTED: 4478 ibd_queue_work_slot(state, 4479 &state->id_prom_req, ASYNC_PROMON); 4480 state->id_prom_op = ONGOING; 4481 break; 4482 case COMPLETED: 4483 ret = GLD_SUCCESS; 4484 break; 4485 case ERRORED: 4486 state->id_prom_op = NOTSTARTED; 4487 ret = GLD_FAILURE; 4488 } 4489 /* 4490 * Else in the ONGOING case, nothing special 4491 * needs to be done; the async thread will poke 4492 * all streams. A prior set, or the last unset 4493 * request is still in the async queue. 4494 */ 4495 mutex_exit(&state->id_mc_mutex); 4496 return (ret); 4497 case GLD_MAC_PROMISC_NONE: 4498 DPRINT(1, "ibd_set_promiscuous : unset_promisc"); 4499 /* 4500 * Look at gld: this might be getting 4501 * called because someone is turning off 4502 * prom_phys or prom_multi. Mark operation 4503 * as ongoing, to prevent a subsequent set 4504 * operation from using the request slot 4505 * unless the async thread is ready to give 4506 * it up. The async thread will mark the 4507 * request slot as usable as soon as it 4508 * starts doing the unset operation. 4509 */ 4510 ASSERT(state->id_prom_op == COMPLETED); 4511 state->id_prom_op = ONGOING; 4512 ibd_queue_work_slot(state, &state->id_prom_req, 4513 ASYNC_PROMOFF); 4514 return (GLD_SUCCESS); 4515 default: 4516 return (GLD_NOTSUPPORTED); 4517 } 4518 } 4519 4520 /* 4521 * GLD entry point for gathering statistics. 4522 */ 4523 static int 4524 ibd_get_stats(gld_mac_info_t *macinfo, struct gld_stats *sp) 4525 { 4526 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4527 4528 sp->glds_errrcv = 0; 4529 sp->glds_underflow = 0; 4530 sp->glds_missed = 0; 4531 4532 sp->glds_overflow = state->id_tx_short; /* Tx overflow */ 4533 sp->glds_speed = state->id_link_speed; 4534 sp->glds_media = GLDM_IB; 4535 sp->glds_errxmt = state->id_ah_error; /* failed AH translation */ 4536 sp->glds_norcvbuf = state->id_rx_short; /* # times below water mark */ 4537 sp->glds_intr = state->id_num_intrs; /* number of intrs */ 4538 4539 return (GLD_SUCCESS); 4540 } 4541 4542 /* 4543 * Arrange for a Tx request that is failing, or has already failed due to 4544 * Tx descriptor shortage to be retried soon. Used mostly with poll based 4545 * Tx completion, since gld_sched() can not be invoked in ibd_send() context 4546 * due to potential single processor deadlock (when the ibd_send() is 4547 * caused by gld_recv()). 4548 */ 4549 static void 4550 ibd_tx_sched(ibd_state_t *state) 4551 { 4552 mutex_enter(&state->id_sched_lock); 4553 /* 4554 * If a sched request is already enqueued, do not try to do 4555 * that again, since the async work request list would get 4556 * corrupted. 4557 */ 4558 if (!state->id_sched_queued) { 4559 state->id_sched_queued = B_TRUE; 4560 ibd_queue_work_slot(state, &state->id_sched_req, ASYNC_SCHED); 4561 } 4562 mutex_exit(&state->id_sched_lock); 4563 } 4564 4565 /* 4566 * The gld_sched() in ibd_async_work() does the work for us. 
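 *
 * For illustration, the deferred retry path in outline (names as used in
 * this file): a send that fails for lack of wqe's never calls gld_sched()
 * directly from send/interrupt context, it queues work instead:
 *
 *	ibd_send()/ibd_acquire_swqes() --> ibd_tx_sched()
 *	    --> ASYNC_SCHED request --> async thread
 *	    --> ibd_async_txsched() below, then gld_sched() in
 *		ibd_async_work()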
4567 */ 4568 static void 4569 ibd_async_txsched(ibd_state_t *state) 4570 { 4571 mutex_enter(&state->id_sched_lock); 4572 state->id_sched_queued = B_FALSE; 4573 mutex_exit(&state->id_sched_lock); 4574 } 4575 4576 /* 4577 * Release one or more chained send wqes back into free list. 4578 */ 4579 static void 4580 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *fswqe, ibd_swqe_t *lswqe, 4581 boolean_t send_context) 4582 { 4583 boolean_t call_gld_sched = B_FALSE; 4584 4585 /* 4586 * Add back on Tx list for reuse. 4587 */ 4588 lswqe->swqe_next = NULL; 4589 mutex_enter(&state->id_tx_list.dl_mutex); 4590 if (state->id_tx_list.dl_pending_sends) { 4591 state->id_tx_list.dl_pending_sends = B_FALSE; 4592 call_gld_sched = B_TRUE; 4593 } 4594 if (state->id_tx_list.dl_head == NULL) { 4595 state->id_tx_list.dl_head = SWQE_TO_WQE(fswqe); 4596 } else { 4597 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(fswqe); 4598 } 4599 state->id_tx_list.dl_tail = SWQE_TO_WQE(lswqe); 4600 mutex_exit(&state->id_tx_list.dl_mutex); 4601 4602 /* 4603 * See comments in ibd_tx_sched(); make sure not to call 4604 * gld_sched() if we are in ibd_send() context. 4605 */ 4606 if (call_gld_sched) 4607 if ((ibd_txcomp_poll == 0) && (!send_context)) 4608 gld_sched(state->id_macinfo); 4609 else 4610 ibd_tx_sched(state); 4611 } 4612 4613 /* 4614 * Acquire a number of chained send wqe's from the free list. Returns the 4615 * number of wqe's actually allocated, and pointers to the first and last 4616 * in the chain. 4617 */ 4618 static int 4619 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe, 4620 int number) 4621 { 4622 int numwqe = number; 4623 ibd_swqe_t *node, *wqes; 4624 4625 /* 4626 * Check and reclaim some of the completed Tx requests. 4627 * If someone else is already in this code and pulling Tx 4628 * completions, no need to poll, since the current lock holder 4629 * will do the work anyway. Normally, we poll for completions 4630 * every few Tx attempts, but if we are short on Tx descriptors, 4631 * we always try to poll. 4632 */ 4633 if ((ibd_txcomp_poll == 1) && 4634 (((atomic_add_32_nv(&state->id_tx_sends, 1) & IBD_TXPOLL_MASK) == 4635 0) || state->id_tx_list.dl_pending_sends) && 4636 (mutex_tryenter(&state->id_txcomp_lock) != 0)) { 4637 DPRINT(10, "ibd_send : polling"); 4638 ibd_poll_compq(state, state->id_scq_hdl); 4639 mutex_exit(&state->id_txcomp_lock); 4640 } 4641 4642 /* 4643 * Grab required transmit wqes. 4644 */ 4645 mutex_enter(&state->id_tx_list.dl_mutex); 4646 node = wqes = WQE_TO_SWQE(state->id_tx_list.dl_head); 4647 while ((node != NULL) && (numwqe-- > 1)) 4648 node = WQE_TO_SWQE(node->swqe_next); 4649 4650 /* 4651 * If we did not find the number we were looking for, flag no resource. 4652 * Adjust list appropriately in either case. 4653 */ 4654 if (numwqe != 0) { 4655 state->id_tx_list.dl_head = state->id_tx_list.dl_tail = NULL; 4656 state->id_tx_list.dl_pending_sends = B_TRUE; 4657 mutex_exit(&state->id_tx_list.dl_mutex); 4658 DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); 4659 atomic_add_64(&state->id_tx_short, 1); 4660 if (ibd_txcomp_poll == 1) { 4661 /* 4662 * Arrange for a future gld_sched(). Note that when 4663 * the Tx is retried after a little bit, it will 4664 * surely poll the completion queue above. 
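 *
 * Note (illustrative): the polling branch above is throttled with the
 * classic mask trick, so only one send in every (mask + 1) attempts
 * reaps completions unless descriptors are already short:
 *
 *	if (((atomic_add_32_nv(&count, 1) & mask) == 0) || short_on_wqes)
 *		poll the Tx CQ;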
4665 */ 4666 ibd_tx_sched(state); 4667 } 4668 } else { 4669 state->id_tx_list.dl_head = node->swqe_next; 4670 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(node)) 4671 state->id_tx_list.dl_tail = NULL; 4672 mutex_exit(&state->id_tx_list.dl_mutex); 4673 } 4674 4675 /* 4676 * Set return parameters. 4677 */ 4678 *fswqe = wqes; 4679 *lswqe = node; 4680 return (number - numwqe); 4681 } 4682 4683 typedef struct ibd_mpack_s { 4684 ibd_swqe_t *ip_swqe; 4685 uint32_t ip_start, ip_stuff, ip_flags; 4686 ibd_ace_t *ip_ace; 4687 boolean_t ip_copy; 4688 boolean_t ip_noresources; 4689 int ip_segs; 4690 ibt_mr_hdl_t ip_mhdl[IBD_MDTMAX_SEGS + 1]; 4691 ibt_mr_desc_t ip_mdsc[IBD_MDTMAX_SEGS + 1]; 4692 } ibd_mpack_t; 4693 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mpack_s)) 4694 4695 static void 4696 ibd_mdt_txone(gld_mac_info_t *macinfo, void *cookie, pdescinfo_t *dl_pkt_info) 4697 { 4698 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4699 ibd_mpack_t *ptx = (ibd_mpack_t *)cookie; 4700 ibd_ace_t *ace = ptx->ip_ace; 4701 ibd_swqe_t *wqes, *node = ptx->ip_swqe; 4702 boolean_t docopy = ptx->ip_copy; 4703 uchar_t *pptr; 4704 int i, pktsize, seglen, seg = 0; 4705 4706 /* 4707 * Snag the next wqe before we post this one, since it could complete 4708 * very fast and the wqe could get put at the end of the list, 4709 * corrupting our chain. Set up for the next packet. 4710 */ 4711 wqes = WQE_TO_SWQE(node->swqe_next); 4712 ptx->ip_swqe = wqes; 4713 4714 IBD_CKSUM_MDT_PACKET(dl_pkt_info, ptx->ip_start, ptx->ip_stuff, 4715 ptx->ip_flags); 4716 node->w_ahandle = ace; 4717 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4718 4719 if (docopy) { 4720 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 4721 pptr = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 4722 pktsize = seglen = PDESC_HDRL(dl_pkt_info); 4723 if (seglen > 0) { 4724 bcopy(dl_pkt_info->hdr_rptr, pptr, seglen); 4725 pptr += seglen; 4726 } 4727 for (; seg < dl_pkt_info->pld_cnt; seg++) 4728 if ((seglen = PDESC_PLDL(dl_pkt_info, seg)) > 0) { 4729 bcopy(dl_pkt_info->pld_ary[seg].pld_rptr, 4730 pptr, seglen); 4731 pptr += seglen; 4732 pktsize += seglen; 4733 } 4734 node->w_swr.wr_nds = 1; 4735 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 4736 } else { 4737 seglen = PDESC_HDRL(dl_pkt_info); 4738 if (seglen > 0) { 4739 node->w_smblk_sgl[seg].ds_va = 4740 (ib_vaddr_t)(uintptr_t)dl_pkt_info->hdr_rptr; 4741 node->w_smblk_sgl[seg].ds_key = ptx->ip_mdsc[0].md_lkey; 4742 node->w_smblk_sgl[seg].ds_len = seglen; 4743 seg++; 4744 } 4745 for (i = 0; i < dl_pkt_info->pld_cnt; i++) { 4746 if ((seglen = PDESC_PLDL(dl_pkt_info, i)) > 0) { 4747 node->w_smblk_sgl[seg].ds_va = (ib_vaddr_t) 4748 (uintptr_t)dl_pkt_info->pld_ary[i].pld_rptr; 4749 node->w_smblk_sgl[seg].ds_key = 4750 ptx->ip_mdsc[dl_pkt_info-> 4751 pld_ary[i].pld_pbuf_idx + 1].md_lkey; 4752 node->w_smblk_sgl[seg].ds_len = seglen; 4753 seg++; 4754 } 4755 } 4756 node->w_swr.wr_sgl = node->w_smblk_sgl; 4757 node->w_swr.wr_nds = seg; 4758 } 4759 4760 if (ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL) != 4761 IBT_SUCCESS) { 4762 /* 4763 * We never expect a failure here. But handle it, just in case. 4764 * If this is not the last packet, there are no problems; if 4765 * it is the last packet and the previous ones have not been 4766 * transmitted yet by the hardware, in the registration case, 4767 * the hardware might transmit garbage since we will be 4768 * freemsg'ing. The AH is still safe. 
4769 */ 4770 DPRINT(5, "ibd_mdt_txone: posting failed"); 4771 ibd_tx_cleanup(state, node, B_TRUE); 4772 } 4773 } 4774 4775 static int 4776 ibd_mdt_pre(gld_mac_info_t *macinfo, mblk_t *mp, void **cookie) 4777 { 4778 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4779 multidata_t *dlmdp = mmd_getmultidata(mp); 4780 ibd_mpack_t *mdinfo; 4781 mbufinfo_t bufinfo, *binfo = &bufinfo; 4782 pattrinfo_t attr_info; 4783 uchar_t *dlap; 4784 ibt_mr_attr_t mem_attr; 4785 ibd_swqe_t *wqes, *node; 4786 ipoib_mac_t *dest; 4787 size_t hsize, psize = 0; 4788 int numwqes, numpackets = (int)mmd_getcnt(dlmdp, NULL, NULL); 4789 int i, ret; 4790 uint32_t end, value; 4791 boolean_t noresources = B_FALSE; 4792 4793 ASSERT(DB_TYPE(mp) == M_MULTIDATA); 4794 ASSERT(mp->b_cont == NULL); 4795 4796 if ((numwqes = ibd_acquire_swqes(state, &wqes, &node, numpackets)) == 0) 4797 return (0); 4798 else if (numwqes != numpackets) 4799 noresources = B_TRUE; 4800 4801 DPRINT(20, "ibd_mdt_pre: %d packets %p/%p\n", numwqes, wqes, node); 4802 4803 /* 4804 * Allocate the cookie that will be passed to subsequent packet 4805 * transmit and post_mdt calls by GLD. We can not sleep, so if 4806 * there is no memory, just tell GLD to drop the entire MDT message. 4807 */ 4808 if ((mdinfo = kmem_zalloc(sizeof (ibd_mpack_t), KM_NOSLEEP)) == NULL) { 4809 ibd_release_swqes(state, wqes, node, B_TRUE); 4810 return (-1); 4811 } 4812 *cookie = (void *)mdinfo; 4813 mdinfo->ip_noresources = noresources; 4814 4815 /* 4816 * Walk Global Attributes. If TCP failed to provide destination 4817 * information, or some interposing module removed the information, 4818 * fail the entire message. 4819 */ 4820 attr_info.type = PATTR_DSTADDRSAP; 4821 if (mmd_getpattr(dlmdp, NULL, &attr_info) == NULL) { 4822 ibd_release_swqes(state, wqes, node, B_TRUE); 4823 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4824 return (-1); 4825 } 4826 dlap = ((pattr_addr_t *)attr_info.buf)->addr; 4827 dest = (ipoib_mac_t *)dlap; 4828 4829 /* 4830 * Get the AH for this destination, incrementing the posted 4831 * reference count properly. 4832 */ 4833 if ((mdinfo->ip_ace = ibd_acache_lookup(state, dest, &ret, 4834 numwqes)) == NULL) { 4835 ibd_release_swqes(state, wqes, node, B_TRUE); 4836 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4837 return ((ret == GLD_FAILURE) ? -1 : 0); 4838 } 4839 4840 /* 4841 * Depending on how costly it is to copy vs register, we try to 4842 * register, falling back on copying if we fail. 
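 *
 * Condensed sketch of that decision (illustrative only):
 *
 *	if (header_size + payload_size > IBD_TX_COPY_THRESHOLD) {
 *		ibt_register_mr() each buffer for zero-copy gather;
 *		if any registration fails, deregister the rest and
 *		fall through to the copy path;
 *	} else {
 *		bcopy() the packet into the wqe's premapped copybuf;
 *	}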
4843 */ 4844 mmd_getregions(dlmdp, &bufinfo); 4845 hsize = binfo->hbuf_wptr - binfo->hbuf_rptr; 4846 for (i = 0; i < binfo->pbuf_cnt; i++) 4847 psize += (binfo->pbuf_ary[i].pbuf_wptr - 4848 binfo->pbuf_ary[i].pbuf_rptr); 4849 if ((hsize + psize) > IBD_TX_COPY_THRESHOLD) { 4850 mdinfo->ip_segs = i + 1; 4851 if (hsize != 0) { 4852 mem_attr.mr_as = NULL; 4853 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4854 mem_attr.mr_vaddr = 4855 (uint64_t)(uintptr_t)binfo->hbuf_rptr; 4856 mem_attr.mr_len = hsize; 4857 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 4858 &mem_attr, &mdinfo->ip_mhdl[0], 4859 &mdinfo->ip_mdsc[0]) != IBT_SUCCESS) 4860 goto ibd_mdt_copy; 4861 DPRINT(10, "ibd_mdt_pre: hsize = %d\n", hsize); 4862 } 4863 for (i = 0; i < binfo->pbuf_cnt; i++) { 4864 if ((psize = (binfo->pbuf_ary[i].pbuf_wptr - 4865 binfo->pbuf_ary[i].pbuf_rptr)) != 0) { 4866 mem_attr.mr_as = NULL; 4867 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4868 mem_attr.mr_vaddr = (uint64_t)(uintptr_t) 4869 binfo->pbuf_ary[i].pbuf_rptr; 4870 mem_attr.mr_len = psize; 4871 if (ibt_register_mr(state->id_hca_hdl, 4872 state->id_pd_hdl, &mem_attr, 4873 &mdinfo->ip_mhdl[i + 1], 4874 &mdinfo->ip_mdsc[i + 1]) != IBT_SUCCESS) { 4875 for (; i >= 0; i--) { 4876 (void) ibt_deregister_mr( 4877 state->id_hca_hdl, 4878 mdinfo->ip_mhdl[i]); 4879 } 4880 goto ibd_mdt_copy; 4881 } 4882 DPRINT(10, "ibd_mdt_pre: psize = %lu\n", psize); 4883 } 4884 } 4885 4886 mdinfo->ip_copy = B_FALSE; 4887 4888 /* 4889 * All the deregistration must happen once the last swqe 4890 * completes. 4891 */ 4892 node->swqe_im_mblk = mp; 4893 node->w_mdtinfo = mdinfo; 4894 DPRINT(10, "ibd_mdt_pre: last wqe = %p\n", node); 4895 } else { 4896 ibd_mdt_copy: 4897 mdinfo->ip_copy = B_TRUE; 4898 } 4899 4900 /* 4901 * Do checksum related work. 4902 */ 4903 IBD_CKSUM_MDT(mp, dlmdp, NULL, &mdinfo->ip_start, &mdinfo->ip_stuff, 4904 &end, &value, &mdinfo->ip_flags); 4905 4906 mdinfo->ip_swqe = wqes; 4907 return (numwqes); 4908 } 4909 4910 /* ARGSUSED */ 4911 static void 4912 ibd_mdt_post(gld_mac_info_t *macinfo, mblk_t *mp, void *cookie) 4913 { 4914 ibd_mpack_t *mdinfo = (ibd_mpack_t *)cookie; 4915 4916 if (mdinfo->ip_copy) { 4917 if (!mdinfo->ip_noresources) 4918 freemsg(mp); 4919 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4920 } 4921 } 4922 4923 /* 4924 * GLD entry point for transmitting a datagram. 4925 * The passed in packet has this format: 4926 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 4927 */ 4928 static int 4929 ibd_send(gld_mac_info_t *macinfo, mblk_t *mp) 4930 { 4931 ibt_status_t ibt_status; 4932 ibt_mr_attr_t mem_attr; 4933 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4934 ibd_ace_t *ace; 4935 ibd_swqe_t *node; 4936 ipoib_mac_t *dest; 4937 ipoib_ptxhdr_t *ipibp; 4938 ip6_t *ip6h; 4939 mblk_t *nmp = mp; 4940 uint_t pktsize; 4941 size_t blksize; 4942 uchar_t *bufp; 4943 int i, ret, len, nmblks = 1; 4944 boolean_t dofree = B_TRUE; 4945 4946 if (ibd_acquire_swqes(state, &node, &node, 1) == 0) 4947 return (GLD_NORESOURCES); 4948 4949 /* 4950 * Obtain an address handle for the destination. 4951 */ 4952 dest = (ipoib_mac_t *)mp->b_rptr; 4953 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 4954 node->w_ahandle = ace; 4955 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4956 } else { 4957 DPRINT(5, 4958 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 4959 ((ret == GLD_FAILURE) ? 
"failed" : "queued"), 4960 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 4961 htonl(dest->ipoib_gidpref[1]), 4962 htonl(dest->ipoib_gidsuff[0]), 4963 htonl(dest->ipoib_gidsuff[1])); 4964 node->w_ahandle = NULL; 4965 goto ibd_send_fail; 4966 } 4967 4968 /* 4969 * For ND6 packets, padding is at the front of the source lladdr. 4970 * Insert the padding at front. 4971 */ 4972 ipibp = (ipoib_ptxhdr_t *)mp->b_rptr; 4973 if (ntohs(ipibp->ipoib_rhdr.ipoib_type) == IP6_DL_SAP) { 4974 if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + IPV6_HDR_LEN) { 4975 if (!pullupmsg(mp, IPV6_HDR_LEN + 4976 sizeof (ipoib_ptxhdr_t))) { 4977 DPRINT(10, "ibd_send: pullupmsg failure "); 4978 ret = GLD_FAILURE; 4979 goto ibd_send_fail; 4980 } 4981 } 4982 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_ptxhdr_t)); 4983 len = ntohs(ip6h->ip6_plen); 4984 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 4985 if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + 4986 IPV6_HDR_LEN + len) { 4987 if (!pullupmsg(mp, sizeof (ipoib_ptxhdr_t) + 4988 IPV6_HDR_LEN + len)) { 4989 DPRINT(10, "ibd_send: pullupmsg " 4990 "failure "); 4991 ret = GLD_FAILURE; 4992 goto ibd_send_fail; 4993 } 4994 } 4995 /* LINTED: E_CONSTANT_CONDITION */ 4996 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 4997 } 4998 } 4999 5000 mp->b_rptr += IPOIB_ADDRL; 5001 while (((nmp = nmp->b_cont) != NULL) && 5002 (++nmblks < (state->id_max_sqseg + 1))); 5003 pktsize = msgsize(mp); 5004 if (pktsize > state->id_mtu) { 5005 ret = GLD_BADARG; 5006 goto ibd_send_fail; 5007 } 5008 5009 /* 5010 * Do checksum related work. 5011 */ 5012 IBD_CKSUM_SEND(mp); 5013 5014 /* 5015 * Copy the data to preregistered buffers, or register the buffer. 5016 */ 5017 if ((nmblks <= state->id_max_sqseg) && 5018 (pktsize > IBD_TX_COPY_THRESHOLD)) { 5019 for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) { 5020 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr; 5021 mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr; 5022 mem_attr.mr_as = NULL; 5023 mem_attr.mr_flags = IBT_MR_NOSLEEP; 5024 ibt_status = ibt_register_mr(state->id_hca_hdl, 5025 state->id_pd_hdl, &mem_attr, 5026 &node->w_smblkbuf[i].im_mr_hdl, 5027 &node->w_smblkbuf[i].im_mr_desc); 5028 if (ibt_status != IBT_SUCCESS) { 5029 /* 5030 * We do not expect any error other than 5031 * IBT_INSUFF_RESOURCE. 5032 */ 5033 if (ibt_status != IBT_INSUFF_RESOURCE) 5034 DPRINT(10, "ibd_send:%d\n", 5035 "failed in ibt_register_mem()", 5036 ibt_status); 5037 DPRINT(5, "ibd_send: registration failed"); 5038 node->w_swr.wr_nds = i; 5039 /* 5040 * Deregister already registered memory; 5041 * fallback to copying the mblk. 5042 */ 5043 ibd_deregister_mr(state, node); 5044 goto ibd_copy_path; 5045 } 5046 node->w_smblk_sgl[i].ds_va = 5047 (ib_vaddr_t)(uintptr_t)nmp->b_rptr; 5048 node->w_smblk_sgl[i].ds_key = 5049 node->w_smblkbuf[i].im_mr_desc.md_lkey; 5050 node->w_smblk_sgl[i].ds_len = 5051 nmp->b_wptr - nmp->b_rptr; 5052 } 5053 node->swqe_im_mblk = mp; 5054 node->w_swr.wr_sgl = node->w_smblk_sgl; 5055 node->w_swr.wr_nds = nmblks; 5056 dofree = B_FALSE; 5057 } else { 5058 ibd_copy_path: 5059 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5060 node->w_swr.wr_nds = 1; 5061 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5062 5063 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5064 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 5065 blksize = MBLKL(nmp); 5066 bcopy(nmp->b_rptr, bufp, blksize); 5067 bufp += blksize; 5068 } 5069 } 5070 5071 /* 5072 * Queue the wqe to hardware. 
5073 */ 5074 ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL); 5075 if (ibt_status != IBT_SUCCESS) { 5076 /* 5077 * We should not fail here; but just in case we do, we 5078 * tell GLD about this error. 5079 */ 5080 ret = GLD_FAILURE; 5081 DPRINT(5, "ibd_send: posting failed"); 5082 goto ibd_send_fail; 5083 } 5084 5085 DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X", 5086 INCTXPACK, htonl(ace->ac_mac.ipoib_qpn), 5087 htonl(ace->ac_mac.ipoib_gidpref[0]), 5088 htonl(ace->ac_mac.ipoib_gidpref[1]), 5089 htonl(ace->ac_mac.ipoib_gidsuff[0]), 5090 htonl(ace->ac_mac.ipoib_gidsuff[1])); 5091 5092 if (dofree) 5093 freemsg(mp); 5094 5095 return (GLD_SUCCESS); 5096 5097 ibd_send_fail: 5098 ibd_tx_cleanup(state, node, B_TRUE); 5099 return (ret); 5100 } 5101 5102 /* 5103 * GLD entry point for handling interrupts. When using combined CQ, 5104 * this handles Tx and Rx completions. With separate CQs, this handles 5105 * only Rx completions. 5106 */ 5107 static uint_t 5108 ibd_intr(gld_mac_info_t *macinfo) 5109 { 5110 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 5111 5112 /* 5113 * Poll for completed entries; the CQ will not interrupt any 5114 * more for incoming (or transmitted) packets. 5115 */ 5116 ibd_poll_compq(state, state->id_rcq_hdl); 5117 5118 /* 5119 * Now enable CQ notifications; all packets that arrive now 5120 * (or complete transmission) will cause new interrupts. 5121 */ 5122 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 5123 IBT_SUCCESS) { 5124 /* 5125 * We do not expect a failure here. 5126 */ 5127 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 5128 } 5129 5130 /* 5131 * Repoll to catch all packets that might have arrived after 5132 * we finished the first poll loop and before interrupts got 5133 * armed. 5134 */ 5135 ibd_poll_compq(state, state->id_rcq_hdl); 5136 5137 return (DDI_INTR_CLAIMED); 5138 } 5139 5140 /* 5141 * Common code for interrupt handling as well as for polling 5142 * for all completed wqe's while detaching. 5143 */ 5144 static void 5145 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5146 { 5147 ibd_wqe_t *wqe; 5148 ibt_wc_t *wc, *wcs; 5149 uint_t numwcs; 5150 int i; 5151 5152 /* 5153 * In some cases (eg detaching), this code can be invoked on 5154 * any cpu after disabling cq notification (thus no concurrency 5155 * exists). Apart from that, the following applies normally: 5156 * The receive completion handling is always on the Rx interrupt 5157 * cpu. Transmit completion handling could be from any cpu if 5158 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 5159 * is interrupt driven. Combined completion handling is always 5160 * on the interrupt cpu. Thus, lock accordingly and use the 5161 * proper completion array. 5162 */ 5163 if (cq_hdl == state->id_rcq_hdl) 5164 wcs = state->id_wcs; 5165 else 5166 wcs = state->id_txwcs; 5167 5168 while (ibt_poll_cq(cq_hdl, wcs, IBD_WC_SIZE, &numwcs) == IBT_SUCCESS) { 5169 5170 for (i = 0, wc = wcs; i < numwcs; i++, wc++) { 5171 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5172 ASSERT((wqe->w_type == IBD_WQE_SEND) || 5173 (wqe->w_type == IBD_WQE_RECV)); 5174 if (wc->wc_status != IBT_WC_SUCCESS) { 5175 /* 5176 * Channel being torn down. 5177 */ 5178 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5179 DPRINT(5, "ibd_intr: flush error"); 5180 /* 5181 * Only invoke the Tx handler to 5182 * release possibly held resources 5183 * like AH refcount etc. 
Can not 5184 * invoke Rx handler because it might 5185 * try adding buffers to the Rx pool 5186 * when we are trying to deinitialize. 5187 */ 5188 if (wqe->w_type == IBD_WQE_RECV) 5189 continue; 5190 } else { 5191 DPRINT(10, "%s %d", 5192 "ibd_intr: Bad CQ status", 5193 wc->wc_status); 5194 } 5195 } 5196 if (wqe->w_type == IBD_WQE_SEND) 5197 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe), 5198 B_FALSE); 5199 else 5200 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5201 } 5202 } 5203 } 5204 5205 /* 5206 * Deregister the mr associated with a given mblk. 5207 */ 5208 static void 5209 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) 5210 { 5211 int i; 5212 5213 DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe, 5214 swqe->w_swr.wr_nds); 5215 /* 5216 * If this is an MDT case, process accordingly. 5217 */ 5218 if (swqe->w_mdtinfo != NULL) { 5219 ibd_mpack_t *mdinfo = (ibd_mpack_t *)swqe->w_mdtinfo; 5220 5221 for (i = 0; i < mdinfo->ip_segs; i++) 5222 if ((mdinfo->ip_mhdl[i] != 0) && 5223 (ibt_deregister_mr(state->id_hca_hdl, 5224 mdinfo->ip_mhdl[i]) != IBT_SUCCESS)) 5225 DPRINT(10, "MDT deregistration failed\n"); 5226 ASSERT(!mdinfo->ip_copy); 5227 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 5228 swqe->w_mdtinfo = NULL; 5229 return; 5230 } 5231 5232 for (i = 0; i < swqe->w_swr.wr_nds; i++) { 5233 if (ibt_deregister_mr(state->id_hca_hdl, 5234 swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) { 5235 /* 5236 * We do not expect any errors here. 5237 */ 5238 DPRINT(10, "failed in ibt_deregister_mem()\n"); 5239 } 5240 } 5241 } 5242 5243 /* 5244 * Common code that deals with clean ups after a successful or 5245 * erroneous transmission attempt. 5246 */ 5247 static void 5248 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context) 5249 { 5250 ibd_ace_t *ace = swqe->w_ahandle; 5251 5252 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 5253 5254 /* 5255 * If this was a dynamic registration in ibd_send() or in MDT, 5256 * deregister now. 5257 */ 5258 if (swqe->swqe_im_mblk != NULL) { 5259 ibd_deregister_mr(state, swqe); 5260 freemsg(swqe->swqe_im_mblk); 5261 swqe->swqe_im_mblk = NULL; 5262 } 5263 5264 /* 5265 * Drop the reference count on the AH; it can be reused 5266 * now for a different destination if there are no more 5267 * posted sends that will use it. This can be eliminated 5268 * if we can always associate each Tx buffer with an AH. 5269 * The ace can be null if we are cleaning up from the 5270 * ibd_send() error path. 5271 */ 5272 if (ace != NULL) { 5273 /* 5274 * The recycling logic can be eliminated from here 5275 * and put into the async thread if we create another 5276 * list to hold ACE's for unjoined mcg's. 5277 */ 5278 if (DEC_REF_DO_CYCLE(ace)) { 5279 ibd_mce_t *mce; 5280 5281 /* 5282 * Check with the lock taken: we decremented 5283 * reference count without the lock, and some 5284 * transmitter might alreay have bumped the 5285 * reference count (possible in case of multicast 5286 * disable when we leave the AH on the active 5287 * list). If not still 0, get out, leaving the 5288 * recycle bit intact. 5289 * 5290 * Atomically transition the AH from active 5291 * to free list, and queue a work request to 5292 * leave the group and destroy the mce. No 5293 * transmitter can be looking at the AH or 5294 * the MCE in between, since we have the 5295 * ac_mutex lock. 
/*
 * Deregister the mr associated with a given mblk.
 */
static void
ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe)
{
        int i;

        DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe,
            swqe->w_swr.wr_nds);
        /*
         * If this is an MDT case, process accordingly.
         */
        if (swqe->w_mdtinfo != NULL) {
                ibd_mpack_t *mdinfo = (ibd_mpack_t *)swqe->w_mdtinfo;

                for (i = 0; i < mdinfo->ip_segs; i++)
                        if ((mdinfo->ip_mhdl[i] != 0) &&
                            (ibt_deregister_mr(state->id_hca_hdl,
                            mdinfo->ip_mhdl[i]) != IBT_SUCCESS))
                                DPRINT(10, "MDT deregistration failed\n");
                ASSERT(!mdinfo->ip_copy);
                kmem_free(mdinfo, sizeof (ibd_mpack_t));
                swqe->w_mdtinfo = NULL;
                return;
        }

        for (i = 0; i < swqe->w_swr.wr_nds; i++) {
                if (ibt_deregister_mr(state->id_hca_hdl,
                    swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) {
                        /*
                         * We do not expect any errors here.
                         */
                        DPRINT(10, "failed in ibt_deregister_mem()\n");
                }
        }
}

/*
 * Common code that deals with clean ups after a successful or
 * erroneous transmission attempt.
 */
static void
ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context)
{
        ibd_ace_t *ace = swqe->w_ahandle;

        DPRINT(20, "ibd_tx_cleanup %p\n", swqe);

        /*
         * If this was a dynamic registration in ibd_send() or in MDT,
         * deregister now.
         */
        if (swqe->swqe_im_mblk != NULL) {
                ibd_deregister_mr(state, swqe);
                freemsg(swqe->swqe_im_mblk);
                swqe->swqe_im_mblk = NULL;
        }

        /*
         * Drop the reference count on the AH; it can be reused
         * now for a different destination if there are no more
         * posted sends that will use it. This can be eliminated
         * if we can always associate each Tx buffer with an AH.
         * The ace can be null if we are cleaning up from the
         * ibd_send() error path.
         */
        if (ace != NULL) {
                /*
                 * The recycling logic can be eliminated from here
                 * and put into the async thread if we create another
                 * list to hold ACE's for unjoined mcg's.
                 */
                if (DEC_REF_DO_CYCLE(ace)) {
                        ibd_mce_t *mce;

                        /*
                         * Check with the lock taken: we decremented the
                         * reference count without the lock, and some
                         * transmitter might already have bumped the
                         * reference count (possible in case of multicast
                         * disable when we leave the AH on the active
                         * list). If not still 0, get out, leaving the
                         * recycle bit intact.
                         *
                         * Atomically transition the AH from active
                         * to free list, and queue a work request to
                         * leave the group and destroy the mce. No
                         * transmitter can be looking at the AH or
                         * the MCE in between, since we have the
                         * ac_mutex lock. In the SendOnly reap case,
                         * it is not necessary to hold the ac_mutex
                         * and recheck the ref count (since the AH was
                         * taken off the active list); we just do it
                         * to have uniform processing with the Full
                         * reap case.
                         */
                        mutex_enter(&state->id_ac_mutex);
                        mce = ace->ac_mce;
                        if (GET_REF_CYCLE(ace) == 0) {
                                CLEAR_REFCYCLE(ace);
                                /*
                                 * Identify the case of fullmember reap as
                                 * opposed to mcg trap reap. Also, port up
                                 * might set ac_mce to NULL to indicate Tx
                                 * cleanup should do no more than put the
                                 * AH in the free list (see ibd_async_link).
                                 */
                                if (mce != NULL) {
                                        ace->ac_mce = NULL;
                                        IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
                                        /*
                                         * mc_req was initialized at mce
                                         * creation time.
                                         */
                                        ibd_queue_work_slot(state,
                                            &mce->mc_req, ASYNC_REAP);
                                }
                                IBD_ACACHE_INSERT_FREE(state, ace);
                        }
                        mutex_exit(&state->id_ac_mutex);
                }
        }

        /*
         * Release the send wqe for reuse.
         */
        ibd_release_swqes(state, swqe, swqe, send_context);
}
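/*
 * Illustrative sketch, not compiled: DEC_REF_DO_CYCLE(), GET_REF_CYCLE()
 * and CLEAR_REFCYCLE() used above are defined elsewhere in the driver.
 * The hypothetical ex_* fragment below only illustrates the underlying
 * technique of packing a reference count and a "recycle when idle" flag
 * into one word updated with atomic_add_32_nv(), so the thread that
 * drops the last reference on a marked AH knows it must recycle it;
 * the bit layout shown is an assumption, not the driver's encoding.
 */
#if 0   /* sketch only */
#define EX_CYCLE_BIT    0x80000000u     /* recycle-when-idle flag */
#define EX_REF_MASK     0x7fffffffu     /* reference count bits */

static boolean_t
ex_dec_ref_do_cycle(uint32_t *word)
{
        uint32_t new = atomic_add_32_nv(word, -1);

        /* True only if the flag is set and this was the last reference. */
        return (((new & EX_CYCLE_BIT) != 0) && ((new & EX_REF_MASK) == 0));
}
#endif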
/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD.
 * The received packet has this format: 2b sap :: 00 :: data.
 */
static void
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
        ipoib_pgrh_t *pgrh;
        mblk_t *mp;
        ipoib_hdr_t *ipibp;
        ip6_t *ip6h;
        int rxcnt, len;

        /*
         * Track number handed to upper layer, and number still
         * available to receive packets.
         */
        rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
        ASSERT(rxcnt >= 0);
        atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);

        /*
         * Adjust write pointer depending on how much data came in.
         */
        mp = rwqe->rwqe_im_mblk;
        mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;

        /*
         * If the GRH is not valid, indicate to GLD by setting
         * the VerTcFlow field to 0. Else, update the pseudoGRH
         * so that GLD can determine the source mac of the packet.
         */
        pgrh = (ipoib_pgrh_t *)mp->b_rptr;
        if (wc->wc_flags & IBT_WC_GRH_PRESENT)
                pgrh->ipoib_sqpn = htonl(wc->wc_qpn);
        else
                pgrh->ipoib_vertcflow = 0;

        DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK);

        /*
         * For ND6 packets, padding is at the front of the source/target
         * lladdr. However the inet6 layer is not aware of it, hence remove
         * the padding from such packets.
         */
        ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
        if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
                if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
                        if (!pullupmsg(mp, IPV6_HDR_LEN +
                            sizeof (ipoib_hdr_t))) {
                                DPRINT(10, "ibd_process_rx: pullupmsg failed");
                                freemsg(mp);
                                return;
                        }
                }
                ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
                len = ntohs(ip6h->ip6_plen);
                if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
                        if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
                            IPV6_HDR_LEN + len) {
                                if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
                                    IPV6_HDR_LEN + len)) {
                                        DPRINT(10, "ibd_process_rx: pullupmsg"
                                            " failed");
                                        freemsg(mp);
                                        return;
                                }
                        }
                        /* LINTED: E_CONSTANT_CONDITION */
                        IBD_PAD_NSNA(ip6h, len, IBD_RECV);
                }
        }

        /*
         * Hand off to service thread/GLD. When we have hardware that
         * does hardware checksum, we will pull the checksum from the
         * work completion structure here (on the interrupt cpu).
         */
        ibd_send_up(state, mp);

        /*
         * Possibly replenish the Rx pool if needed.
         */
        if (rxcnt < IBD_RX_THRESHOLD) {
                state->id_rx_short++;
                if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) {
                        if (ibd_post_rwqe(state, rwqe, B_FALSE) ==
                            DDI_FAILURE) {
                                ibd_free_rwqe(state, rwqe);
                                return;
                        }
                }
        }
}

/*
 * Callback code invoked from STREAMS when the recv data buffer is free
 * for recycling.
 */
static void
ibd_freemsg_cb(char *arg)
{
        ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
        ibd_state_t *state = rwqe->w_state;

        /*
         * If the wqe is being destructed, do not attempt recycling.
         */
        if (rwqe->w_freeing_wqe == B_TRUE) {
                DPRINT(6, "ibd_freemsg_cb: wqe being freed");
                return;
        }

        /*
         * Upper layer has released held mblk.
         */
        atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);

        if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) {
                /*
                 * There are already enough buffers on the Rx ring.
                 * Free this one up.
                 */
                rwqe->rwqe_im_mblk = NULL;
                ibd_delete_rwqe(state, rwqe);
                ibd_free_rwqe(state, rwqe);
                DPRINT(6, "ibd_freemsg_cb: free up wqe");
        } else {
                rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
                    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
                if (rwqe->rwqe_im_mblk == NULL) {
                        ibd_delete_rwqe(state, rwqe);
                        ibd_free_rwqe(state, rwqe);
                        DPRINT(6, "ibd_freemsg_cb: desballoc failed");
                        return;
                }

                /*
                 * Post back to h/w. We could actually have more than
                 * id_num_rwqe WQEs on the list if there were multiple
                 * ibd_freemsg_cb() calls outstanding (since the lock is
                 * not held the entire time). This will start getting
                 * corrected over subsequent ibd_freemsg_cb() calls.
                 */
                if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) {
                        ibd_delete_rwqe(state, rwqe);
                        ibd_free_rwqe(state, rwqe);
                        return;
                }
        }
}
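/*
 * Illustrative sketch, not compiled: ibd_freemsg_cb() above is reached
 * through the frtn_t passed to desballoc(9F), which lets the driver
 * loan its receive buffer to the upper layers and get called back when
 * the mblk is finally freed, at which point the buffer can be recycled
 * or torn down. The hypothetical ex_rbuf_t below shows that wiring in
 * miniature; the real driver keeps the frtn_t in the rwqe
 * (rwqe->w_freemsg_cb).
 */
#if 0   /* sketch only */
typedef struct ex_rbuf {
        uchar_t         *eb_addr;       /* receive data buffer */
        size_t          eb_size;        /* buffer length */
        frtn_t          eb_frtn;        /* STREAMS free-routine descriptor */
} ex_rbuf_t;

static mblk_t *
ex_loan_rbuf(ex_rbuf_t *eb, void (*recycle)(char *))
{
        eb->eb_frtn.free_func = recycle;        /* e.g. ibd_freemsg_cb */
        eb->eb_frtn.free_arg = (char *)eb;

        /* Returns NULL if no mblk header could be allocated. */
        return (desballoc(eb->eb_addr, eb->eb_size, 0, &eb->eb_frtn));
}
#endif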
#ifdef RUN_PERFORMANCE

/*
 * To run the performance test, first do the "ifconfig ibdN plumb" on
 * the Rx and Tx side. Then use mdb -kw to tweak the following variables:
 * ibd_performance=1.
 * ibd_receiver=1 on Rx side.
 * ibd_sender=1 on Tx side.
 * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update
 * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx; this will
 * make it drop into a 1 minute loop waiting for packets. An
 * ifconfig/unplumb on the Tx will cause it to send packets to Rx.
 */

#define IBD_NUM_UNSIGNAL        ibd_num_unsignal
#define IBD_TX_PKTSIZE          ibd_tx_pktsize
#define IBD_TX_DATASIZE         ibd_tx_datasize

static ibd_swqe_t **swqes;
static ibt_wc_t *wcs;

/*
 * Set these on Rx and Tx side to do performance run.
 */
static int ibd_performance = 0;
static int ibd_receiver = 0;
static int ibd_sender = 0;
static ipoib_mac_t ibd_dest;

/*
 * Interrupt coalescing is achieved by asking for a completion intr
 * only every ibd_num_unsignal'th packet.
 */
static int ibd_num_unsignal = 8;

/*
 * How big is each packet?
 */
static int ibd_tx_pktsize = 2048;

/*
 * Total data size to be transmitted.
 */
static int ibd_tx_datasize = 512*1024*1024;

static volatile boolean_t cq_handler_ran = B_FALSE;
static volatile int num_completions;

/* ARGSUSED */
static void
ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibd_state_t *state = (ibd_state_t *)arg;
        ibt_cq_hdl_t cqhdl;
        ibd_wqe_t *wqe;
        uint_t polled, i;
        boolean_t cq_enabled = B_FALSE;

        if (ibd_receiver == 1)
                cqhdl = state->id_rcq_hdl;
        else
                cqhdl = state->id_scq_hdl;

        /*
         * Mark the handler as having run and possibly freed up some
         * slots. Blocked sends can be retried.
         */
        cq_handler_ran = B_TRUE;

repoll:
        while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) ==
            IBT_SUCCESS) {
                num_completions += polled;
                if (ibd_receiver == 1) {
                        /*
                         * We can immediately recycle the buffer. No
                         * need to pass up to any IP layer ...
                         */
                        for (i = 0; i < polled; i++) {
                                wqe = (ibd_wqe_t *)wcs[i].wc_id;
                                (void) ibt_post_recv(state->id_chnl_hdl,
                                    &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL);
                        }
                }
        }

        /*
         * If we just repolled, we are done; exit.
         */
        if (cq_enabled)
                return;

        /*
         * Enable CQ.
         */
        if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) {
                /*
                 * We do not expect a failure here.
                 */
                cmn_err(CE_CONT, "ibd_perf_handler: notify failed");
        }
        cq_enabled = B_TRUE;

        /*
         * Repoll for packets that came in after we finished previous
         * poll loop but before we turned on notifications.
         */
        goto repoll;
}
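/*
 * Illustrative fragment, not compiled: the coalescing trick used by
 * ibd_perf_tx() below is to leave most work requests unsignaled and
 * request a completion only on every IBD_NUM_UNSIGNAL'th WR, so a
 * single CQ callback reaps a whole batch. The loop below restates only
 * that marking step; IBT_WR_SEND_SIGNAL is assumed here to be the
 * "signal this WR" flag, and batch/i are hypothetical locals.
 */
#if 0   /* sketch only */
        for (i = 0; i < batch; i++) {
                if (((i + 1) % IBD_NUM_UNSIGNAL) == 0)
                        swqes[i]->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
                else
                        swqes[i]->w_swr.wr_flags = IBT_WR_NO_FLAGS;
        }
#endif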
static void
ibd_perf_tx(ibd_state_t *state)
{
        ibt_mr_hdl_t mrhdl;
        ibt_mr_desc_t mrdesc;
        ibt_mr_attr_t mem_attr;
        ibt_status_t stat;
        ibd_ace_t *ace = NULL;
        ibd_swqe_t *node;
        uchar_t *sendbuf;
        longlong_t stime, etime;
        longlong_t sspin, espin, tspin = 0;
        int i, reps, packets;

        cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X",
            htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]),
            htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]),
            htonl(ibd_dest.ipoib_gidsuff[1]));
        if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) ||
            (ibd_dest.ipoib_gidpref[1] == 0)) {
                cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address");
                return;
        }

        packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE);
        reps = (packets / IBD_NUM_SWQE);

        cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE);
        cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE);
        cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets);
        cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE);
        cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL);
        if ((packets % IBD_NUM_UNSIGNAL) != 0) {
                /*
                 * This is required to ensure the last packet will trigger
                 * a CQ handler callback, thus we can spin waiting for all
                 * packets to be received.
                 */
                cmn_err(CE_CONT,
                    "ibd_perf_tx: #Packets not multiple of Signal Grp size");
                return;
        }
        num_completions = 0;

        swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE,
            KM_NOSLEEP);
        if (swqes == NULL) {
                cmn_err(CE_CONT, "ibd_perf_tx: no storage");
                return;
        }

        wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
        if (wcs == NULL) {
                kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
                cmn_err(CE_CONT, "ibd_perf_tx: no storage");
                return;
        }

        /*
         * Get the ud_dest for the destination.
         */
        ibd_async_acache(state, &ibd_dest);
        mutex_enter(&state->id_ac_mutex);
        ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0);
        mutex_exit(&state->id_ac_mutex);
        if (ace == NULL) {
                kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
                kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
                cmn_err(CE_CONT, "ibd_perf_tx: no AH");
                return;
        }

        /*
         * Set up the send buffer.
         */
        sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP);
        if (sendbuf == NULL) {
                kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
                kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
                cmn_err(CE_CONT, "ibd_perf_tx: no send buffer");
                return;
        }

        /*
         * This buffer can be used in the case when we want to
         * send data from the same memory area over and over;
         * it might help in reducing memory traffic.
         */
        mem_attr.mr_vaddr = (uint64_t)sendbuf;
        mem_attr.mr_len = IBD_TX_PKTSIZE;
        mem_attr.mr_as = NULL;
        mem_attr.mr_flags = IBT_MR_NOSLEEP;
        if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
            &mrhdl, &mrdesc) != IBT_SUCCESS) {
                kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
                kmem_free(sendbuf, IBD_TX_PKTSIZE);
                kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
                cmn_err(CE_CONT, "ibd_perf_tx: registration failed");
                return;
        }

        /*
         * Allocate private send wqe's.
         */
        for (i = 0; i < IBD_NUM_SWQE; i++) {
                if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) {
                        kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
                        kmem_free(sendbuf, IBD_TX_PKTSIZE);
                        kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
                        cmn_err(CE_CONT, "ibd_alloc_swqe failure");
                        return;
                }
                node->w_ahandle = ace;
#if 0
                node->w_smblkbuf[0].im_mr_hdl = mrhdl;
                node->w_smblkbuf[0].im_mr_desc = mrdesc;
                node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf;
                node->w_smblk_sgl[0].ds_key =
                    node->w_smblkbuf[0].im_mr_desc.md_lkey;
                node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE;
                node->w_swr.wr_sgl = node->w_smblk_sgl;
#else
                node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE;
                node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
#endif

                /*
                 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs
                 * is marked to invoke the CQ handler. That is the only
                 * way we come to know when the send queue can accept more
                 * WRs.
                 */
                if (((i + 1) % IBD_NUM_UNSIGNAL) != 0)
                        node->w_swr.wr_flags = IBT_WR_NO_FLAGS;
                node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
                node->w_swr.wr_nds = 1;

                swqes[i] = node;
        }

        ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state);

        /*
         * Post all the requests. We expect this stream of posts will
         * not overwhelm the hardware due to the periodic completions and
         * polling that happen out of ibd_perf_handler.
         * Post a set of requests, till the channel can accept; after
         * that, wait for the CQ handler to notify us that there is more
         * space.
         */
        stime = gethrtime();
        for (; reps > 0; reps--)
                for (i = 0; i < IBD_NUM_SWQE; i++) {
                        node = swqes[i];
retry:
                        if ((stat = ibt_post_send(state->id_chnl_hdl,
                            &node->w_swr, 1, NULL)) != IBT_SUCCESS) {
                                if (stat == IBT_CHAN_FULL) {
                                        /*
                                         * Spin till the CQ handler runs
                                         * and then try again.
                                         */
                                        sspin = gethrtime();
                                        while (!cq_handler_ran);
                                        espin = gethrtime();
                                        tspin += (espin - sspin);
                                        cq_handler_ran = B_FALSE;
                                        goto retry;
                                }
                                cmn_err(CE_CONT, "post failure %d/%d", stat, i);
                                goto done;
                        }
                }

done:
        /*
         * We should really be snapshotting when we get the last
         * completion.
         */
        while (num_completions != (packets / IBD_NUM_UNSIGNAL));
        etime = gethrtime();

        cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d",
            num_completions);
        cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime));
        cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin);

        /*
         * Wait a sec for everything to get over.
         */
        delay(drv_usectohz(2000000));

        /*
         * Reset CQ handler to real one; free resources.
         */
        if (ibd_separate_cqs == 0) {
                ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state);
        } else {
                if (ibd_txcomp_poll == 0)
                        ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler,
                            state);
                else
                        ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
        }

        for (i = 0; i < IBD_NUM_SWQE; i++)
                ibd_free_swqe(state, swqes[i]);
        (void) ibt_deregister_mr(state->id_hca_hdl, mrhdl);
        kmem_free(sendbuf, IBD_TX_PKTSIZE);
        kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
        kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
}

static void
ibd_perf_rx(ibd_state_t *state)
{
        wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP);
        if (wcs == NULL) {
                kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE);
                cmn_err(CE_CONT, "ibd_perf_rx: no storage");
                return;
        }

        /*
         * We do not need to allocate private recv wqe's. We will
         * just use the regular ones.
         */

        num_completions = 0;
        ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state);

        /*
         * Delay for a minute for all the packets to come in from
         * transmitter.
         */
        cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_SWQE);
        delay(drv_usectohz(60000000));
        cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions);

        /*
         * Reset CQ handler to real one; free resources.
         */
        ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
        kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL);
}

static void
ibd_perf(ibd_state_t *state)
{
        if (ibd_performance == 0)
                return;

        if (ibd_receiver == 1) {
                ibd_perf_rx(state);
                return;
        }

        if (ibd_sender == 1) {
                ibd_perf_tx(state);
                return;
        }
}

#endif /* RUN_PERFORMANCE */
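/*
 * Illustrative helper, not compiled: the RUN_PERFORMANCE test above
 * reports elapsed and spin time in nanoseconds; the throughput it
 * measured can be derived from those numbers as sketched below. The
 * helper name and placement are hypothetical.
 */
#if 0   /* sketch only */
static longlong_t
ex_mbit_per_sec(longlong_t bytes, longlong_t elapsed_ns)
{
        /* Mbit/s == (bytes * 8 * 10^9 / ns) / 10^6 == bytes * 8000 / ns */
        return ((bytes * 8000LL) / elapsed_ns);
}
#endif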