/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>

#include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip_if.h>		/* for IP6_DL_SAP */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/pattr.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

/*
 * Modes of hardware/driver/software checksum, useful for debugging
 * and performance studies.
 *
 * none: neither h/w (Tavor) nor the driver does checksum; IP software must.
 * partial: driver does data checksum, IP must provide pseudo header.
 * perf_partial: driver uses IP provided pseudo cksum as data checksum
 *		 (thus, real checksumming is not done).
 */
typedef enum {
	IBD_CSUM_NONE,
	IBD_CSUM_PARTIAL,
	IBD_CSUM_PERF_PARTIAL
} ibd_csum_type_t;

typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;

/*
 * Per interface tunable parameters.
 */
static uint_t ibd_rx_threshold = 16;
static uint_t ibd_tx_current_copy_threshold = 0x10000000;
static uint_t ibd_num_rwqe = 4095;	/* 1 less than max Tavor CQsize */
static uint_t ibd_num_swqe = 4095;	/* 1 less than max Tavor CQsize */
static uint_t ibd_num_ah = 16;
static uint_t ibd_hash_size = 16;
static uint_t ibd_srv_fifos = 0xffff;
static uint_t ibd_fifo_depth = 0;
static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;

/*
 * The driver can use separate CQs for send and receive queues.
 * While using separate CQs, it is possible to put the send CQ
 * in polling mode, i.e., not to enable notifications on that CQ.
 * If both CQs are interrupt driven, currently it is not possible
 * for their handlers to be invoked concurrently (since Tavor ties
 * both interrupts to the same PCI intr line); but the handlers
 * are not coded with a single interrupt cpu assumption (e.g.,
 * id_num_intrs is incremented atomically).
 *
 * The driver private struct uses id_scq_hdl to track the separate
 * CQ being used for send; the id_rcq_hdl tracks the receive CQ
 * if using separate CQs, or it tracks the single CQ when using
 * a combined CQ. The id_wcs completion array is used in the combined
 * CQ case, and for fetching Rx completions in the separate CQs case;
 * the id_txwcs is used to fetch Tx completions in the separate CQs
 * case.
 */
static uint_t ibd_separate_cqs = 1;
static uint_t ibd_txcomp_poll = 0;

/*
 * Initial number of IBA resources allocated.
 */
#define	IBD_NUM_RWQE	ibd_num_rwqe
#define	IBD_NUM_SWQE	ibd_num_swqe
#define	IBD_NUM_AH	ibd_num_ah

/* when <= threshold, it's faster to copy to a premapped buffer */
#define	IBD_TX_COPY_THRESHOLD	ibd_tx_current_copy_threshold

/*
 * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
 * allocate a new WQE to put on the rxlist. This value must be <=
 * IBD_NUM_RWQE/id_num_rwqe.
 */
#define	IBD_RX_THRESHOLD	ibd_rx_threshold

/*
 * Hash table size for the active AH list.
 */
#define	IBD_HASH_SIZE	ibd_hash_size

/*
 * Size of completion array to be filled by a single poll call.
 */
#define	IBD_WC_SIZE	16

/*
 * We poll every (IBD_TXPOLL_MASK + 1) sends for completions. This
 * is based on our above completion array size.
 */
#define	IBD_TXPOLL_MASK	0xf

/*
 * Number of payload areas the MDT code can support. Choose the same value
 * that we know is supported by TCP/MDT.
155 */ 156 #define IBD_MDTMAX_SEGS 16 157 158 /* 159 * PAD routine called during send/recv context 160 */ 161 #define IBD_SEND 0 162 #define IBD_RECV 1 163 164 /* Driver State Pointer */ 165 void *ibd_list; 166 167 /* Required system entry points */ 168 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 169 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 170 171 /* Required driver entry points for GLD */ 172 static int ibd_reset(gld_mac_info_t *); 173 static int ibd_start(gld_mac_info_t *); 174 static int ibd_stop(gld_mac_info_t *); 175 static int ibd_set_mac_addr(gld_mac_info_t *, unsigned char *); 176 static int ibd_set_multicast(gld_mac_info_t *, unsigned char *, int); 177 static int ibd_set_promiscuous(gld_mac_info_t *, int); 178 static int ibd_get_stats(gld_mac_info_t *, struct gld_stats *); 179 static int ibd_send(gld_mac_info_t *, mblk_t *); 180 static int ibd_mdt_pre(gld_mac_info_t *, mblk_t *, void **); 181 static void ibd_mdt_txone(gld_mac_info_t *, void *, pdescinfo_t *); 182 static void ibd_mdt_post(gld_mac_info_t *, mblk_t *, void *); 183 static uint_t ibd_intr(gld_mac_info_t *); 184 185 /* Private driver entry points for GLD */ 186 static int ibd_state_init(ibd_state_t *, dev_info_t *); 187 static void ibd_state_fini(ibd_state_t *); 188 static int ibd_drv_init(ibd_state_t *); 189 static void ibd_drv_fini(ibd_state_t *); 190 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 191 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 192 static void ibd_snet_notices_handler(void *, ib_gid_t, 193 ibt_subnet_event_code_t, ibt_subnet_event_t *); 194 static int ibd_init_txlist(ibd_state_t *); 195 static void ibd_fini_txlist(ibd_state_t *); 196 static int ibd_init_rxlist(ibd_state_t *); 197 static void ibd_fini_rxlist(ibd_state_t *); 198 static void ibd_freemsg_cb(char *); 199 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *, boolean_t); 200 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 201 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **); 202 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 203 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 204 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 205 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 206 ibt_async_event_t *); 207 static int ibd_acache_init(ibd_state_t *); 208 static void ibd_acache_fini(ibd_state_t *); 209 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 210 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 211 static void ibd_async_unsetprom(ibd_state_t *, boolean_t); 212 static void ibd_async_setprom(ibd_state_t *, boolean_t); 213 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 214 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 215 static void ibd_async_txsched(ibd_state_t *); 216 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 217 static void ibd_async_work(ibd_state_t *); 218 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 219 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 220 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t); 221 static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *, 222 ipoib_mac_t *); 223 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 224 static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *); 225 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 226 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 227 static uint64_t 
ibd_get_portspeed(ibd_state_t *); 228 229 #ifdef RUN_PERFORMANCE 230 static void ibd_perf(ibd_state_t *); 231 #endif 232 233 /* Streams Module Info */ 234 static struct module_info ibd_minfo = { 235 IBD_IDNUM, /* module ID Number */ 236 "ibd", /* module name */ 237 0, /* min packet size */ 238 INFPSZ, /* maximum packet size */ 239 IBD_HIWAT, /* high water mark */ 240 IBD_LOWAT /* low water mark */ 241 }; 242 243 /* Streams Read Queue */ 244 static struct qinit ibd_rdinit = { 245 NULL, /* put */ 246 gld_rsrv, /* service */ 247 gld_open, /* open */ 248 gld_close, /* close */ 249 NULL, /* unused */ 250 &ibd_minfo, /* parameters */ 251 NULL /* statistics */ 252 }; 253 254 /* Streams Write Queue */ 255 static struct qinit ibd_wrinit = { 256 gld_wput, /* put */ 257 gld_wsrv, /* service */ 258 NULL, /* open */ 259 NULL, /* close */ 260 NULL, /* unused */ 261 &ibd_minfo, /* parameters */ 262 NULL /* statistics */ 263 }; 264 265 /* Stream Operations */ 266 static struct streamtab ibd_streamtab = { 267 &ibd_rdinit, /* read queue */ 268 &ibd_wrinit, /* write queue */ 269 NULL, /* lower read queue (MUX) */ 270 NULL /* lower write queue (MUX) */ 271 }; 272 273 /* Character/Block Operations */ 274 static struct cb_ops ibd_cb_ops = { 275 nulldev, /* open */ 276 nulldev, /* close */ 277 nodev, /* strategy (block) */ 278 nodev, /* print (block) */ 279 nodev, /* dump (block) */ 280 nodev, /* read */ 281 nodev, /* write */ 282 nodev, /* ioctl */ 283 nodev, /* devmap */ 284 nodev, /* mmap */ 285 nodev, /* segmap */ 286 nochpoll, /* chpoll */ 287 ddi_prop_op, /* prop_op */ 288 &ibd_streamtab, /* streams */ 289 D_MP | D_64BIT, /* flags */ 290 CB_REV /* rev */ 291 }; 292 293 /* Driver Operations */ 294 static struct dev_ops ibd_dev_ops = { 295 DEVO_REV, /* struct rev */ 296 0, /* refcnt */ 297 gld_getinfo, /* getinfo */ 298 nulldev, /* identify */ 299 nulldev, /* probe */ 300 ibd_attach, /* attach */ 301 ibd_detach, /* detach */ 302 nodev, /* reset */ 303 &ibd_cb_ops, /* cb_ops */ 304 NULL, /* bus_ops */ 305 nodev /* power */ 306 }; 307 308 /* Module Driver Info */ 309 static struct modldrv ibd_modldrv = { 310 &mod_driverops, 311 "InfiniBand DLPI Driver %I%", 312 &ibd_dev_ops 313 }; 314 315 /* Module Linkage */ 316 static struct modlinkage ibd_modlinkage = { 317 MODREV_1, 318 &ibd_modldrv, 319 NULL 320 }; 321 322 /* 323 * Module Info passed to IBTL during IBT_ATTACH. 324 * NOTE: This data must be static (i.e. IBTL just keeps a pointer to this 325 * data). 326 */ 327 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 328 IBTI_V2, 329 IBT_NETWORK, 330 ibd_async_handler, 331 NULL, 332 "IPIB" 333 }; 334 335 /* 336 * Async operation types. 337 */ 338 #define ASYNC_GETAH 1 339 #define ASYNC_JOIN 2 340 #define ASYNC_LEAVE 3 341 #define ASYNC_PROMON 4 342 #define ASYNC_PROMOFF 5 343 #define ASYNC_REAP 6 344 #define ASYNC_POKE 7 345 #define ASYNC_TRAP 8 346 #define ASYNC_SCHED 9 347 #define ASYNC_LINK 10 348 #define ASYNC_EXIT 11 349 350 /* 351 * Async operation states 352 */ 353 #define NOTSTARTED 0 354 #define ONGOING 1 355 #define COMPLETED 2 356 #define ERRORED 3 357 #define ROUTERED 4 358 359 #define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF 360 361 #ifdef DEBUG 362 363 static int rxpack = 1, txpack = 1; 364 int debuglevel = 100; 365 static void 366 debug_print(int l, char *fmt, ...) 
367 { 368 va_list ap; 369 370 if (l < debuglevel) 371 return; 372 va_start(ap, fmt); 373 vcmn_err(CE_CONT, fmt, ap); 374 va_end(ap); 375 } 376 #define INCRXPACK (rxpack++) 377 #define INCTXPACK (txpack++) 378 #define DPRINT debug_print 379 380 #else /* DEBUG */ 381 382 #define INCRXPACK 0 383 #define INCTXPACK 0 384 #define DPRINT 385 386 #endif /* DEBUG */ 387 388 /* 389 * Common routine to print warning messages; adds in hca guid, port number 390 * and pkey to be able to identify the IBA interface. 391 */ 392 static void 393 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 394 { 395 ib_guid_t hca_guid; 396 char ibd_print_buf[256]; 397 int len; 398 va_list ap; 399 400 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 401 0, "hca-guid", 0); 402 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 403 "%s%d: HCA GUID %016llx port %d PKEY %02x ", ibd_minfo.mi_idname, 404 state->id_macinfo->gldm_ppa, (u_longlong_t)hca_guid, 405 state->id_port, state->id_pkey); 406 va_start(ap, fmt); 407 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 408 fmt, ap); 409 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 410 va_end(ap); 411 } 412 413 /* warlock directives */ 414 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 415 ibd_state_t::id_ah_active)) 416 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free)) 417 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 418 ibd_state_t::id_req_list)) 419 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 420 ibd_state_t::id_acache_req_cv)) 421 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 422 ibd_state_t::id_multi_req)) 423 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 424 ibd_state_t::id_multi_addr)) 425 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 426 ibd_state_t::id_multi_op)) 427 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 428 ibd_state_t::id_multi_queued)) 429 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 430 ibd_state_t::id_mc_full)) 431 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 432 ibd_state_t::id_mc_non)) 433 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 434 ibd_state_t::id_link_state)) 435 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 436 ibd_state_s::id_tx_list)) 437 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 438 ibd_state_s::id_rx_list)) 439 440 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_multi_op)) 441 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error)) 442 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op)) 443 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs)) 444 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op)) 445 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short)) 446 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list)) 447 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list)) 448 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op)) 449 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid)) 450 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr)) 451 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce)) 452 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref)) 453 454 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s)) 455 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s)) 456 
_NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s)) 457 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac)) 458 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh)) 459 460 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s)) 461 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req)) 462 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap)) 463 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate)) 464 465 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr)) 466 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr)) 467 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", gld_stats)) 468 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id)) 469 470 #ifdef DEBUG 471 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack)) 472 _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack)) 473 #endif 474 475 int 476 _init() 477 { 478 int status; 479 480 /* 481 * Sanity check some parameter settings. Tx completion polling 482 * only makes sense with separate CQs for Tx and Rx. 483 */ 484 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 485 cmn_err(CE_NOTE, "!%s: %s", ibd_minfo.mi_idname, 486 "Setting ibd_txcomp_poll = 0 for combined CQ"); 487 ibd_txcomp_poll = 0; 488 } 489 490 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 491 if (status != 0) { 492 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 493 return (status); 494 } 495 496 status = mod_install(&ibd_modlinkage); 497 if (status != 0) { 498 DPRINT(10, "_init:failed in mod_install()"); 499 ddi_soft_state_fini(&ibd_list); 500 return (status); 501 } 502 503 return (0); 504 } 505 506 int 507 _info(struct modinfo *modinfop) 508 { 509 return (mod_info(&ibd_modlinkage, modinfop)); 510 } 511 512 int 513 _fini() 514 { 515 int status; 516 517 status = mod_remove(&ibd_modlinkage); 518 if (status != 0) 519 return (status); 520 521 ddi_soft_state_fini(&ibd_list); 522 return (0); 523 } 524 525 /* 526 * Convert the GID part of the mac address from network byte order 527 * to host order. 528 */ 529 static void 530 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 531 { 532 ib_sn_prefix_t nbopref; 533 ib_guid_t nboguid; 534 535 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 536 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 537 dgid->gid_prefix = b2h64(nbopref); 538 dgid->gid_guid = b2h64(nboguid); 539 } 540 541 /* 542 * Create the IPoIB address in network byte order from host order inputs. 543 */ 544 static void 545 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 546 ib_guid_t guid) 547 { 548 ib_sn_prefix_t nbopref; 549 ib_guid_t nboguid; 550 551 mac->ipoib_qpn = htonl(qpn); 552 nbopref = h2b64(prefix); 553 nboguid = h2b64(guid); 554 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 555 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 556 } 557 558 /* 559 * Send to the appropriate all-routers group when the IBA multicast group 560 * does not exist, based on whether the target group is v4 or v6. 561 */ 562 static boolean_t 563 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 564 ipoib_mac_t *rmac) 565 { 566 boolean_t retval = B_TRUE; 567 uint32_t adjscope = state->id_scope << 16; 568 uint32_t topword; 569 570 /* 571 * Copy the first 4 bytes in without assuming any alignment of 572 * input mac address; this will have IPoIB signature, flags and 573 * scope bits. 
574 */ 575 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 576 topword = ntohl(topword); 577 578 /* 579 * Generate proper address for IPv4/v6, adding in the Pkey properly. 580 */ 581 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 582 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 583 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 584 ((uint32_t)(state->id_pkey << 16))), 585 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 586 else 587 /* 588 * Does not have proper bits in the mgid address. 589 */ 590 retval = B_FALSE; 591 592 return (retval); 593 } 594 595 /* 596 * Implementation of various (software) flavors of send and receive side 597 * checksumming. 598 */ 599 #define IBD_CKSUM_SEND(mp) { \ 600 uint32_t start, stuff, end, value, flags; \ 601 uint32_t cksum, sum; \ 602 uchar_t *dp, *buf; \ 603 uint16_t *up; \ 604 \ 605 if (ibd_csum_send == IBD_CSUM_NONE) \ 606 goto punt_send; \ 607 \ 608 /* \ 609 * Query IP whether Tx cksum needs to be done. \ 610 */ \ 611 hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, \ 612 &value, &flags); \ 613 \ 614 if (flags == HCK_PARTIALCKSUM) { \ 615 dp = ((uchar_t *)mp->b_rptr + IPOIB_HDRSIZE); \ 616 up = (uint16_t *)(dp + stuff); \ 617 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 618 end = ((uchar_t *)mp->b_wptr - dp - start); \ 619 cksum = *up; \ 620 *up = 0; \ 621 /* \ 622 * Does NOT handle chained mblks/more than one \ 623 * SGL. Applicable only for a single SGL \ 624 * entry/mblk, where the stuff offset is \ 625 * within the range of buf. \ 626 */ \ 627 buf = (dp + start); \ 628 sum = IP_BCSUM_PARTIAL(buf, end, cksum); \ 629 } else { \ 630 sum = *up; \ 631 } \ 632 DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n", \ 633 start, stuff, end, sum, cksum); \ 634 sum = ~(sum); \ 635 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 636 } \ 637 punt_send: \ 638 ; \ 639 } 640 641 #define IBD_CKSUM_RECV(mp) { \ 642 uchar_t *dp, *buf; \ 643 uint32_t start, end, value, stuff, flags; \ 644 uint16_t *up, frag; \ 645 ipha_t *iphp; \ 646 ipoib_hdr_t *ipibh; \ 647 \ 648 if (ibd_csum_recv == IBD_CSUM_NONE) \ 649 goto punt_recv; \ 650 \ 651 ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\ 652 if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP) \ 653 goto punt_recv; \ 654 \ 655 dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE); \ 656 iphp = (ipha_t *)dp; \ 657 frag = ntohs(iphp->ipha_fragment_offset_and_flags); \ 658 if ((frag) & (~IPH_DF)) \ 659 goto punt_recv; \ 660 start = IPH_HDR_LENGTH(iphp); \ 661 if (iphp->ipha_protocol == IPPROTO_TCP) \ 662 stuff = start + 16; \ 663 else if (iphp->ipha_protocol == IPPROTO_UDP) \ 664 stuff = start + 6; \ 665 else \ 666 goto punt_recv; \ 667 \ 668 flags = HCK_PARTIALCKSUM; \ 669 end = ntohs(iphp->ipha_length); \ 670 up = (uint16_t *)(dp + stuff); \ 671 \ 672 if (ibd_csum_recv == IBD_CSUM_PARTIAL) { \ 673 buf = (dp + start); \ 674 value = IP_BCSUM_PARTIAL(buf, end - start, 0); \ 675 } else { \ 676 value = (*up); \ 677 } \ 678 if (hcksum_assoc(mp, NULL, NULL, start, stuff, end, \ 679 value, flags, 0) != 0) \ 680 DPRINT(10, "cksum_recv: value: %x\n", value); \ 681 punt_recv: \ 682 ; \ 683 } 684 685 #define IBD_CKSUM_MDT(mp, dlmdp, np, stp, stfp, ep, vp, fp) { \ 686 /* \ 687 * Query IP whether Tx cksum needs to be done. 
\ 688 */ \ 689 if (ibd_csum_send != IBD_CSUM_NONE) \ 690 hcksum_retrieve(mp, dlmdp, np, stp, stfp, ep, vp, fp); \ 691 } 692 693 #define IBD_CKSUM_MDT_PACKET(pinfo, st, stf, fl) { \ 694 if ((ibd_csum_send != IBD_CSUM_NONE) && \ 695 (fl == HCK_PARTIALCKSUM)) { \ 696 extern uint_t bcksum(uchar_t *, int, uint32_t); \ 697 uint16_t *up; \ 698 uint32_t sum; \ 699 uchar_t *hp = (pinfo)->hdr_rptr + IPOIB_HDRSIZE; \ 700 int k; \ 701 \ 702 up = (uint16_t *)(hp + stf); \ 703 if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ 704 sum = *up; \ 705 *up = 0; \ 706 sum = IP_BCSUM_PARTIAL(hp + st, \ 707 PDESC_HDRL(pinfo) - st - IPOIB_HDRSIZE, \ 708 sum); \ 709 for (k = 0; k < pinfo->pld_cnt; k++) \ 710 sum = IP_BCSUM_PARTIAL(pinfo->pld_ary[k].\ 711 pld_rptr, PDESC_PLDL(pinfo, k), \ 712 sum); \ 713 } else { \ 714 sum = *up; \ 715 } \ 716 sum = ~(sum); \ 717 *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ 718 } \ 719 } 720 721 /* 722 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 723 * front of optional src/tgt link layer address. Right now Solaris inserts 724 * padding by default at the end. The routine which is doing is nce_xmit() 725 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when 726 * the packet comes down from IP layer to the IBD driver, it is in the 727 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 728 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 729 * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 730 * 731 * The send routine at IBD driver changes this packet as follows: 732 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 733 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 734 * aligned. 735 * 736 * At the receiving side again ibd_process_rx takes the above packet and 737 * removes the two bytes of front padding and inserts it at the end. This 738 * is since the IP layer does not understand padding at the front. 739 */ 740 #define IBD_PAD_NSNA(ip6h, len, type) { \ 741 uchar_t *nd_lla_ptr; \ 742 icmp6_t *icmp6; \ 743 nd_opt_hdr_t *opt; \ 744 int i; \ 745 \ 746 icmp6 = (icmp6_t *)&ip6h[1]; \ 747 len -= sizeof (nd_neighbor_advert_t); \ 748 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 749 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 750 (len != 0)) { \ 751 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 752 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 753 ASSERT(opt != NULL); \ 754 nd_lla_ptr = (uchar_t *)&opt[1]; \ 755 if (type == 0) { \ 756 for (i = IPOIB_ADDRL; i > 0; i--) \ 757 *(nd_lla_ptr + i + 1) = \ 758 *(nd_lla_ptr + i - 1); \ 759 } else { \ 760 for (i = 0; i < IPOIB_ADDRL; i++) \ 761 *(nd_lla_ptr + i) = \ 762 *(nd_lla_ptr + i + 2); \ 763 } \ 764 *(nd_lla_ptr + i) = 0; \ 765 *(nd_lla_ptr + i + 1) = 0; \ 766 } \ 767 } 768 769 /* 770 * The service fifo code is copied verbatim from Cassini. This can be 771 * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu. 
772 */ 773 774 typedef caddr_t fifo_obj_t, *p_fifo_obj_t; 775 776 typedef struct _srv_fifo_t { 777 kmutex_t fifo_lock; 778 kcondvar_t fifo_cv; 779 size_t size; 780 uint_t max_index; 781 uint_t rd_index; 782 uint_t wr_index; 783 uint_t objs_pending; 784 p_fifo_obj_t fifo_objs; 785 kthread_t *fifo_thread; 786 void (*drain_func)(caddr_t drain_func_arg); 787 caddr_t drain_func_arg; 788 boolean_t running; 789 callb_cpr_t cprinfo; 790 } srv_fifo_t, *p_srv_fifo_t; 791 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv)) 792 _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo)) 793 794 static int 795 _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size, 796 void (*drain_func)(), caddr_t drain_func_arg) 797 { 798 int status; 799 p_srv_fifo_t srv_fifo; 800 801 status = DDI_SUCCESS; 802 srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP); 803 srv_fifo->size = size; 804 srv_fifo->max_index = size - 1; 805 srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc( 806 size * sizeof (fifo_obj_t), KM_SLEEP); 807 mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL); 808 cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL); 809 srv_fifo->drain_func = drain_func; 810 srv_fifo->drain_func_arg = drain_func_arg; 811 srv_fifo->running = DDI_SUCCESS; 812 srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func, 813 (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60); 814 if (srv_fifo->fifo_thread == NULL) { 815 cv_destroy(&srv_fifo->fifo_cv); 816 mutex_destroy(&srv_fifo->fifo_lock); 817 kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t)); 818 kmem_free(srv_fifo, sizeof (srv_fifo_t)); 819 srv_fifo = NULL; 820 status = DDI_FAILURE; 821 } else 822 *handle = srv_fifo; 823 return (status); 824 } 825 826 static void 827 _ddi_srv_fifo_destroy(p_srv_fifo_t handle) 828 { 829 kt_did_t tid = handle->fifo_thread->t_did; 830 831 mutex_enter(&handle->fifo_lock); 832 handle->running = DDI_FAILURE; 833 cv_signal(&handle->fifo_cv); 834 while (handle->running == DDI_FAILURE) 835 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 836 mutex_exit(&handle->fifo_lock); 837 if (handle->objs_pending != 0) 838 cmn_err(CE_NOTE, "!Thread Exit with work undone."); 839 cv_destroy(&handle->fifo_cv); 840 mutex_destroy(&handle->fifo_lock); 841 kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t)); 842 kmem_free(handle, sizeof (srv_fifo_t)); 843 thread_join(tid); 844 } 845 846 static caddr_t 847 _ddi_srv_fifo_begin(p_srv_fifo_t handle) 848 { 849 #ifndef __lock_lint 850 CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock, 851 callb_generic_cpr, "srv_fifo"); 852 #endif /* ! _lock_lint */ 853 return (handle->drain_func_arg); 854 } 855 856 static void 857 _ddi_srv_fifo_end(p_srv_fifo_t handle) 858 { 859 callb_cpr_t cprinfo; 860 861 mutex_enter(&handle->fifo_lock); 862 cprinfo = handle->cprinfo; 863 handle->running = DDI_SUCCESS; 864 cv_signal(&handle->fifo_cv); 865 #ifndef __lock_lint 866 CALLB_CPR_EXIT(&cprinfo); 867 #endif /* ! 
_lock_lint */ 868 thread_exit(); 869 _NOTE(NOT_REACHED) 870 } 871 872 static int 873 _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal) 874 { 875 int status; 876 877 mutex_enter(&handle->fifo_lock); 878 status = handle->running; 879 if (status == DDI_SUCCESS) { 880 if (ptr) { 881 if (handle->objs_pending < handle->size) { 882 if (handle->wr_index == handle->max_index) 883 handle->wr_index = 0; 884 else 885 handle->wr_index++; 886 handle->fifo_objs[handle->wr_index] = ptr; 887 handle->objs_pending++; 888 } else 889 status = DDI_FAILURE; 890 if (signal) 891 cv_signal(&handle->fifo_cv); 892 } else { 893 if (signal && (handle->objs_pending > 0)) 894 cv_signal(&handle->fifo_cv); 895 } 896 } 897 mutex_exit(&handle->fifo_lock); 898 return (status); 899 } 900 901 static int 902 _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr) 903 { 904 int status; 905 906 mutex_enter(&handle->fifo_lock); 907 status = handle->running; 908 if (status == DDI_SUCCESS) { 909 if (handle->objs_pending == 0) { 910 #ifndef __lock_lint 911 CALLB_CPR_SAFE_BEGIN(&handle->cprinfo); 912 cv_wait(&handle->fifo_cv, &handle->fifo_lock); 913 CALLB_CPR_SAFE_END(&handle->cprinfo, 914 &handle->fifo_lock); 915 #endif /* !_lock_lint */ 916 *ptr = NULL; 917 } 918 if (handle->objs_pending > 0) { 919 if (handle->rd_index == handle->max_index) 920 handle->rd_index = 0; 921 else 922 handle->rd_index++; 923 *ptr = handle->fifo_objs[handle->rd_index]; 924 handle->objs_pending--; 925 } 926 status = handle->running; 927 } else { 928 if (handle->objs_pending) { 929 if (handle->rd_index == handle->max_index) 930 handle->rd_index = 0; 931 else 932 handle->rd_index++; 933 *ptr = handle->fifo_objs[handle->rd_index]; 934 handle->objs_pending--; 935 status = DDI_SUCCESS; 936 } else 937 status = DDI_FAILURE; 938 } 939 mutex_exit(&handle->fifo_lock); 940 return (status); 941 } 942 943 /* 944 * [un]map_rx_srv_fifos has been modified from its CE version. 945 */ 946 static void 947 drain_fifo(p_srv_fifo_t handle) 948 { 949 ibd_state_t *state; 950 mblk_t *mp; 951 952 state = (ibd_state_t *)_ddi_srv_fifo_begin(handle); 953 while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) { 954 /* 955 * Hand off to GLD. 956 */ 957 IBD_CKSUM_RECV(mp); 958 gld_recv(state->id_macinfo, mp); 959 } 960 _ddi_srv_fifo_end(handle); 961 } 962 963 static p_srv_fifo_t * 964 map_rx_srv_fifos(int *nfifos, void *private) 965 { 966 p_srv_fifo_t *srv_fifos; 967 int i, inst_taskqs, depth; 968 969 /* 970 * Default behavior on both sparc and amd cpus in terms of 971 * of worker thread is as follows: (N) indicates worker thread 972 * not enabled , (Y) indicates worker thread enabled. Default of 973 * ibd_srv_fifo is set to 0xffff. The default behavior can be 974 * overridden by setting ibd_srv_fifos to 0 or 1 as shown below. 975 * Worker thread model assigns lower priority to network 976 * processing making system more usable at higher network 977 * loads. 
978 * ________________________________________________________ 979 * |Value of ibd_srv_fifo | 0 | 1 | 0xffff| 0 | 1 | 0xfffff | 980 * |----------------------|---|---|-------|---|---|---------| 981 * | | Sparc | x86 | 982 * |----------------------|---|---|-------|---|---|---------| 983 * | Single CPU |N | Y | N | N | Y | N | 984 * |----------------------|---|---|-------|---|---|---------| 985 * | Multi CPU |N | Y | Y | N | Y | Y | 986 * |______________________|___|___|_______|___|___|_________| 987 */ 988 if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) || 989 (ibd_srv_fifos == 0)) { 990 *nfifos = 0; 991 return ((p_srv_fifo_t *)1); 992 } 993 994 *nfifos = inst_taskqs; 995 srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t), 996 KM_SLEEP); 997 998 /* 999 * If the administrator has specified a fifo depth, use 1000 * that, else just decide what should be the depth. 1001 */ 1002 if (ibd_fifo_depth == 0) 1003 depth = (IBD_NUM_RWQE / inst_taskqs) + 16; 1004 else 1005 depth = ibd_fifo_depth; 1006 1007 for (i = 0; i < inst_taskqs; i++) 1008 if (_ddi_srv_fifo_create(&srv_fifos[i], 1009 depth, drain_fifo, 1010 (caddr_t)private) != DDI_SUCCESS) 1011 break; 1012 1013 if (i < inst_taskqs) 1014 goto map_rx_srv_fifos_fail1; 1015 1016 goto map_rx_srv_fifos_exit; 1017 1018 map_rx_srv_fifos_fail1: 1019 i--; 1020 for (; i >= 0; i--) { 1021 _ddi_srv_fifo_destroy(srv_fifos[i]); 1022 } 1023 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 1024 srv_fifos = NULL; 1025 1026 map_rx_srv_fifos_exit: 1027 return (srv_fifos); 1028 } 1029 1030 static void 1031 unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos) 1032 { 1033 int i; 1034 1035 /* 1036 * If this interface was not using service fifos, quickly return. 1037 */ 1038 if (inst_taskqs == 0) 1039 return; 1040 1041 for (i = 0; i < inst_taskqs; i++) { 1042 _ddi_srv_fifo_destroy(srv_fifos[i]); 1043 } 1044 kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); 1045 } 1046 1047 /* 1048 * Choose between sending up the packet directly and handing off 1049 * to a service thread. 1050 */ 1051 static void 1052 ibd_send_up(ibd_state_t *state, mblk_t *mp) 1053 { 1054 p_srv_fifo_t *srvfifo; 1055 ipoib_hdr_t *lhdr; 1056 struct ip *ip_hdr; 1057 struct udphdr *tran_hdr; 1058 uchar_t prot; 1059 int tnum = -1, nfifos = state->id_nfifos; 1060 1061 /* 1062 * Quick path if the interface is not using service fifos. 1063 */ 1064 if (nfifos == 0) { 1065 hand_off: 1066 IBD_CKSUM_RECV(mp); 1067 gld_recv(state->id_macinfo, mp); 1068 return; 1069 } 1070 1071 /* 1072 * Is the packet big enough to look at the IPoIB header 1073 * and basic IP header to determine whether it is an 1074 * IPv4 packet? 1075 */ 1076 if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE + 1077 sizeof (struct ip))) { 1078 1079 lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE); 1080 1081 /* 1082 * Is the packet an IP(v4) packet? 1083 */ 1084 if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) { 1085 1086 ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE + 1087 IPOIB_HDRSIZE); 1088 prot = ip_hdr->ip_p; 1089 1090 /* 1091 * TCP or UDP packet? We use the UDP header, since 1092 * the first few words of both headers are laid out 1093 * similarly (src/dest ports). 1094 */ 1095 if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) { 1096 1097 tran_hdr = (struct udphdr *)( 1098 (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2)); 1099 1100 /* 1101 * Are we within limits of this packet? If 1102 * so, use the destination port to hash to 1103 * a service thread. 
1104 */ 1105 if (mp->b_wptr >= ((uchar_t *)tran_hdr + 1106 sizeof (*tran_hdr))) 1107 tnum = (ntohs(tran_hdr->uh_dport) + 1108 ntohs(tran_hdr->uh_sport)) % 1109 nfifos; 1110 } 1111 } 1112 } 1113 1114 /* 1115 * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the 1116 * packet up in interrupt context, reducing latency. 1117 */ 1118 if (tnum == -1) { 1119 goto hand_off; 1120 } 1121 1122 srvfifo = (p_srv_fifo_t *)state->id_fifos; 1123 if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp, 1124 B_TRUE) != DDI_SUCCESS) 1125 freemsg(mp); 1126 } 1127 1128 /* 1129 * Address handle entries maintained by the driver are kept in the 1130 * free and active lists. Each entry starts out in the free list; 1131 * it migrates to the active list when primed using ibt_get_paths() 1132 * and ibt_modify_ud_dest() for transmission to a specific destination. 1133 * In the active list, the entry has a reference count indicating the 1134 * number of ongoing/uncompleted transmits that reference it. The 1135 * entry is left in the active list even after the reference count 1136 * goes to 0, since successive transmits can find it there and do 1137 * not need to set up another entry (ie the path information is 1138 * cached using the active list). Entries on the active list are 1139 * also hashed using the destination link address as a key for faster 1140 * lookups during transmits. 1141 * 1142 * For any destination address (unicast or multicast, whatever the 1143 * join states), there will be at most one entry in the active list. 1144 * Entries with a 0 reference count on the active list can be reused 1145 * for a transmit to a new destination, if the free list is empty. 1146 * 1147 * The AH free list insertion/deletion is protected with the id_ac_mutex, 1148 * since the async thread and Tx callback handlers insert/delete. The 1149 * active list does not need a lock (all operations are done by the 1150 * async thread) but updates to the reference count are atomically 1151 * done (increments done by Tx path, decrements by the Tx callback handler). 1152 */ 1153 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 1154 list_insert_head(&state->id_ah_free, ce) 1155 #define IBD_ACACHE_GET_FREE(state) \ 1156 list_get_head(&state->id_ah_free) 1157 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 1158 int _ret_; \ 1159 list_insert_head(&state->id_ah_active, ce); \ 1160 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 1161 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1162 ASSERT(_ret_ == 0); \ 1163 } 1164 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 1165 list_remove(&state->id_ah_active, ce); \ 1166 (void) mod_hash_remove(state->id_ah_active_hash, \ 1167 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 1168 } 1169 #define IBD_ACACHE_GET_ACTIVE(state) \ 1170 list_get_head(&state->id_ah_active) 1171 1172 /* 1173 * Membership states for different mcg's are tracked by two lists: 1174 * the "non" list is used for promiscuous mode, when all mcg traffic 1175 * needs to be inspected. This type of membership is never used for 1176 * transmission, so there can not be an AH in the active list 1177 * corresponding to a member in this list. This list does not need 1178 * any protection, since all operations are performed by the async 1179 * thread. 1180 * 1181 * "Full" and "SendOnly" membership is tracked using a single list, 1182 * the "full" list. 
This is because this single list can then be
 * searched during transmit to a multicast group (if an AH for the
 * mcg is not found in the active list), since at least one type
 * of membership must be present before initiating the transmit.
 * This list is also emptied during driver detach, since sendonly
 * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Inserts/deletes to
 * this list are done only by the async thread, but it is also
 * searched in program context (see multicast disable case), thus
 * the id_mc_mutex protects the list. The driver detach path also
 * deconstructs the "full" list, but it ensures that the async
 * thread will not be accessing the list (by blocking out mcg
 * trap handling and making sure no more Tx reaping will happen).
 *
 * Currently, an IBA attach is done in the SendOnly case too,
 * although this is not required.
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&state->id_mc_full, mce)
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&state->id_mc_non, mce)
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&state->id_mc_full, mce)
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&state->id_mc_non, mce)

/*
 * AH and MCE active list manipulation:
 *
 * Multicast disable requests and MCG delete traps are two cases
 * where the active AH entry for the mcg (if any unreferenced one exists)
 * will be moved to the free list (to force the next Tx to the mcg to
 * join the MCG in SendOnly mode). Port up handling will also move AHs
 * from active to free list.
 *
 * In the case when some transmits are still pending on an entry
 * for an mcg, but a multicast disable has already been issued on the
 * mcg, there are some options to consider to preserve the join state
 * to ensure the emitted packet is properly routed on the IBA fabric.
 * For the AH, we can
 * 1. take out of active list at multicast disable time.
 * 2. take out of active list only when last pending Tx completes.
 * For the MCE, we can
 * 3. take out of active list at multicast disable time.
 * 4. take out of active list only when last pending Tx completes.
 * 5. move from active list to stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (i.e., the mcg leave is done) when the pending Tx does complete.
 * Alternatively, a stale list (#5) that would be checked in the enable
 * code would need to be implemented. Option 2 is used, because otherwise,
 * a Tx attempt after the multicast disable would try to put an AH in the
 * active list, and associate the mce it finds in the active list to this
 * new AH, whereas the mce is already associated with the previous AH
 * (taken off the active list), and will be removed once the pending Tx's
 * complete (unless a reference count on mce's is implemented). One
 * implication of using 2,4 is that new Tx's posted before the pending
 * Tx's complete will grab new references on the AH, further delaying
 * the leave.
 *
 * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes through
 * reincarnations).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it can not possibly receive an mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best.
Thus, the driver just prints a warning 1299 * message when it can not rejoin after receiving a create trap, although 1300 * this might be (on rare occassions) a mis-warning if the create trap is 1301 * received after the mcg was deleted. 1302 */ 1303 1304 /* 1305 * Implementation of atomic "recycle" bits and reference count 1306 * on address handles. This utilizes the fact that max reference 1307 * count on any handle is limited by number of send wqes, thus 1308 * high bits in the ac_ref field can be used as the recycle bits, 1309 * and only the low bits hold the number of pending Tx requests. 1310 * This atomic AH reference counting allows the Tx completion 1311 * handler not to acquire the id_ac_mutex to process every completion, 1312 * thus reducing lock contention problems between completion and 1313 * the Tx path. 1314 */ 1315 #define CYCLEVAL 0x80000 1316 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1317 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1318 #define GET_REF(ace) ((ace)->ac_ref) 1319 #define GET_REF_CYCLE(ace) ( \ 1320 /* \ 1321 * Make sure "cycle" bit is set. \ 1322 */ \ 1323 ASSERT(CYCLE_SET(ace)), \ 1324 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1325 ) 1326 #define INC_REF(ace, num) { \ 1327 atomic_add_32(&(ace)->ac_ref, num); \ 1328 } 1329 #define SET_CYCLE_IF_REF(ace) ( \ 1330 CYCLE_SET(ace) ? B_TRUE : \ 1331 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1332 CYCLEVAL ? \ 1333 /* \ 1334 * Clear the "cycle" bit we just set; \ 1335 * ref count known to be 0 from above. \ 1336 */ \ 1337 CLEAR_REFCYCLE(ace), B_FALSE : \ 1338 /* \ 1339 * We set "cycle" bit; let caller know. \ 1340 */ \ 1341 B_TRUE \ 1342 ) 1343 #define DEC_REF_DO_CYCLE(ace) ( \ 1344 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1345 CYCLEVAL ? \ 1346 /* \ 1347 * Ref count known to be 0 from above. \ 1348 */ \ 1349 B_TRUE : \ 1350 B_FALSE \ 1351 ) 1352 1353 static void * 1354 list_get_head(list_t *list) 1355 { 1356 list_node_t *lhead = list_head(list); 1357 1358 if (lhead != NULL) 1359 list_remove(list, lhead); 1360 return (lhead); 1361 } 1362 1363 /* 1364 * This is always guaranteed to be able to queue the work. 1365 */ 1366 static void 1367 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1368 { 1369 /* Initialize request */ 1370 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1371 ptr->rq_op = op; 1372 1373 /* 1374 * Queue provided slot onto request pool. 1375 */ 1376 mutex_enter(&state->id_acache_req_lock); 1377 list_insert_tail(&state->id_req_list, ptr); 1378 1379 /* Go, fetch, async thread */ 1380 cv_signal(&state->id_acache_req_cv); 1381 mutex_exit(&state->id_acache_req_lock); 1382 } 1383 1384 /* 1385 * Main body of the per interface async thread. 1386 */ 1387 static void 1388 ibd_async_work(ibd_state_t *state) 1389 { 1390 ibd_req_t *ptr; 1391 callb_cpr_t cprinfo; 1392 1393 mutex_enter(&state->id_acache_req_lock); 1394 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1395 callb_generic_cpr, "ibd_async_work"); 1396 for (;;) { 1397 ptr = list_get_head(&state->id_req_list); 1398 if (ptr != NULL) { 1399 mutex_exit(&state->id_acache_req_lock); 1400 1401 /* 1402 * Once we have done the operation, there is no 1403 * guarantee the request slot is going to be valid, 1404 * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP). 1405 */ 1406 1407 /* Perform the request */ 1408 switch (ptr->rq_op) { 1409 case ASYNC_GETAH: 1410 ibd_async_acache(state, &ptr->rq_mac); 1411 break; 1412 case ASYNC_POKE: 1413 /* 1414 * We need the gld_sched; that 1415 * happens below. 
No locks are 1416 * needed for the multi_op update. 1417 */ 1418 state->id_multi_op = NOTSTARTED; 1419 break; 1420 case ASYNC_REAP: 1421 ibd_async_reap_group(state, 1422 ptr->rq_ptr, ptr->rq_gid, 1423 IB_MC_JSTATE_FULL); 1424 break; 1425 case ASYNC_LEAVE: 1426 case ASYNC_JOIN: 1427 ibd_async_multicast(state, 1428 ptr->rq_gid, ptr->rq_op); 1429 break; 1430 case ASYNC_PROMON: 1431 ibd_async_setprom(state, B_TRUE); 1432 break; 1433 case ASYNC_PROMOFF: 1434 ibd_async_unsetprom(state, B_TRUE); 1435 break; 1436 case ASYNC_TRAP: 1437 ibd_async_trap(state, ptr); 1438 break; 1439 case ASYNC_SCHED: 1440 ibd_async_txsched(state); 1441 break; 1442 case ASYNC_LINK: 1443 ibd_async_link(state, ptr); 1444 break; 1445 case ASYNC_EXIT: 1446 mutex_enter(&state->id_acache_req_lock); 1447 #ifndef __lock_lint 1448 CALLB_CPR_EXIT(&cprinfo); 1449 #endif /* !__lock_lint */ 1450 _NOTE(NOT_REACHED) 1451 return; 1452 } 1453 1454 /* 1455 * Indicate blocked operation can now be retried. 1456 * Note gld_sched() gets the gld_maclock, 1457 * and the multicast/promiscuous paths 1458 * (ibd_set_multicast(), ibd_set_promiscuous()) 1459 * grab id_acache_req_lock in ibd_queue_work_slot() 1460 * with gld_maclock held, so we must not hold the 1461 * id_acache_req_lock while calling gld_sched to 1462 * prevent deadlock. 1463 */ 1464 gld_sched(state->id_macinfo); 1465 1466 mutex_enter(&state->id_acache_req_lock); 1467 } else { 1468 /* 1469 * Nothing to do: wait till new request arrives. 1470 */ 1471 #ifndef __lock_lint 1472 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1473 cv_wait(&state->id_acache_req_cv, 1474 &state->id_acache_req_lock); 1475 CALLB_CPR_SAFE_END(&cprinfo, 1476 &state->id_acache_req_lock); 1477 #endif /* !_lock_lint */ 1478 } 1479 } 1480 /*NOTREACHED*/ 1481 } 1482 1483 /* 1484 * Return when it is safe to queue requests to the async daemon; primarily 1485 * for subnet trap and async event handling. Disallow requests before the 1486 * daemon is created, and when interface deinitilization starts. 1487 */ 1488 static boolean_t 1489 ibd_async_safe(ibd_state_t *state) 1490 { 1491 mutex_enter(&state->id_trap_lock); 1492 if (state->id_trap_stop) { 1493 mutex_exit(&state->id_trap_lock); 1494 return (B_FALSE); 1495 } 1496 state->id_trap_inprog++; 1497 mutex_exit(&state->id_trap_lock); 1498 return (B_TRUE); 1499 } 1500 1501 /* 1502 * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet 1503 * trap or event handling to complete to kill the async thread and deconstruct 1504 * the mcg/ace list. 1505 */ 1506 static void 1507 ibd_async_done(ibd_state_t *state) 1508 { 1509 mutex_enter(&state->id_trap_lock); 1510 if (--state->id_trap_inprog == 0) 1511 cv_signal(&state->id_trap_cv); 1512 mutex_exit(&state->id_trap_lock); 1513 } 1514 1515 /* 1516 * Hash functions: 1517 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1518 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1519 * These operate on mac addresses input into ibd_send, but there is no 1520 * guarantee on the alignment of the ipoib_mac_t structure. 1521 */ 1522 /*ARGSUSED*/ 1523 static uint_t 1524 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1525 { 1526 ulong_t ptraddr = (ulong_t)key; 1527 uint_t hval; 1528 1529 /* 1530 * If the input address is 4 byte aligned, we can just dereference 1531 * it. This is most common, since IP will send in a 4 byte aligned 1532 * IP header, which implies the 24 byte IPoIB psuedo header will be 1533 * 4 byte aligned too. 
1534 */ 1535 if ((ptraddr & 3) == 0) 1536 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1537 1538 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1539 return (hval); 1540 } 1541 1542 static int 1543 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1544 { 1545 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1546 return (0); 1547 else 1548 return (1); 1549 } 1550 1551 /* 1552 * Initialize all the per interface caches and lists; AH cache, 1553 * MCG list etc. 1554 */ 1555 static int 1556 ibd_acache_init(ibd_state_t *state) 1557 { 1558 ibd_ace_t *ce; 1559 int i; 1560 1561 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1562 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1563 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1564 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1565 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1566 offsetof(ibd_ace_t, ac_list)); 1567 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1568 offsetof(ibd_ace_t, ac_list)); 1569 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1570 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1571 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1572 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1573 offsetof(ibd_mce_t, mc_list)); 1574 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1575 offsetof(ibd_mce_t, mc_list)); 1576 list_create(&state->id_req_list, sizeof (ibd_req_t), 1577 offsetof(ibd_req_t, rq_list)); 1578 1579 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1580 IBD_NUM_AH, KM_SLEEP); 1581 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1582 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1583 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1584 ibd_acache_fini(state); 1585 return (DDI_FAILURE); 1586 } else { 1587 CLEAR_REFCYCLE(ce); 1588 ce->ac_mce = NULL; 1589 IBD_ACACHE_INSERT_FREE(state, ce); 1590 } 1591 } 1592 return (DDI_SUCCESS); 1593 } 1594 1595 static void 1596 ibd_acache_fini(ibd_state_t *state) 1597 { 1598 ibd_ace_t *ptr; 1599 1600 mutex_enter(&state->id_ac_mutex); 1601 1602 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1603 ASSERT(GET_REF(ptr) == 0); 1604 (void) ibt_free_ud_dest(ptr->ac_dest); 1605 } 1606 1607 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1608 ASSERT(GET_REF(ptr) == 0); 1609 (void) ibt_free_ud_dest(ptr->ac_dest); 1610 } 1611 1612 list_destroy(&state->id_ah_free); 1613 list_destroy(&state->id_ah_active); 1614 list_destroy(&state->id_mc_full); 1615 list_destroy(&state->id_mc_non); 1616 list_destroy(&state->id_req_list); 1617 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1618 mutex_exit(&state->id_ac_mutex); 1619 mutex_destroy(&state->id_ac_mutex); 1620 mutex_destroy(&state->id_mc_mutex); 1621 mutex_destroy(&state->id_acache_req_lock); 1622 cv_destroy(&state->id_acache_req_cv); 1623 } 1624 1625 /* 1626 * Search AH active hash list for a cached path to input destination. 1627 * If we are "just looking", hold == F. When we are in the Tx path, 1628 * we set hold == T to grab a reference on the AH so that it can not 1629 * be recycled to a new destination while the Tx request is posted. 1630 */ 1631 static ibd_ace_t * 1632 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1633 { 1634 ibd_ace_t *ptr; 1635 1636 ASSERT(mutex_owned(&state->id_ac_mutex)); 1637 1638 /* 1639 * Do hash search. 
1640 */ 1641 if (mod_hash_find(state->id_ah_active_hash, 1642 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1643 if (hold) 1644 INC_REF(ptr, num); 1645 return (ptr); 1646 } 1647 return (NULL); 1648 } 1649 1650 /* 1651 * This is called by the tx side; if an initialized AH is found in 1652 * the active list, it is locked down and can be used; if no entry 1653 * is found, an async request is queued to do path resolution. 1654 */ 1655 static ibd_ace_t * 1656 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1657 { 1658 ibd_ace_t *ptr; 1659 1660 /* 1661 * Only attempt to print when we can; in the mdt pattr case, the 1662 * address is not aligned properly. 1663 */ 1664 if (((ulong_t)mac & 3) == 0) 1665 DPRINT(4, 1666 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1667 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1668 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1669 htonl(mac->ipoib_gidsuff[1])); 1670 1671 mutex_enter(&state->id_ac_mutex); 1672 1673 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1674 mutex_exit(&state->id_ac_mutex); 1675 return (ptr); 1676 } 1677 1678 /* 1679 * Implementation of a single outstanding async request; if 1680 * the operation is not started yet, queue a request and move 1681 * to ongoing state. Remember in id_ah_addr for which address 1682 * we are queueing the request, in case we need to flag an error; 1683 * Any further requests, for the same or different address, until 1684 * the operation completes, is sent back to GLD to be retried. 1685 * The async thread will update id_ah_op with an error indication 1686 * or will set it to indicate the next look up can start; either 1687 * way, it will gld_sched() so that all blocked requests come 1688 * back here. 1689 */ 1690 *err = GLD_NORESOURCES; 1691 if (state->id_ah_op == NOTSTARTED) { 1692 /* 1693 * We did not even find the entry; queue a request for it. 1694 */ 1695 bcopy(mac, &(state->id_ah_req.rq_mac), IPOIB_ADDRL); 1696 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_GETAH); 1697 state->id_ah_op = ONGOING; 1698 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1699 } else if ((state->id_ah_op != ONGOING) && 1700 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1701 /* 1702 * Check the status of the pathrecord lookup request 1703 * we had queued before. 1704 */ 1705 if (state->id_ah_op == ERRORED) { 1706 *err = GLD_FAILURE; 1707 state->id_ah_error++; 1708 } else { 1709 /* 1710 * ROUTERED case: We need to send to the 1711 * all-router MCG. If we can find the AH for 1712 * the mcg, the Tx will be attempted. If we 1713 * do not find the AH, we return NORESOURCES 1714 * to retry. 1715 */ 1716 ipoib_mac_t routermac; 1717 1718 (void) ibd_get_allroutergroup(state, mac, &routermac); 1719 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1720 numwqe); 1721 } 1722 state->id_ah_op = NOTSTARTED; 1723 } else if ((state->id_ah_op != ONGOING) && 1724 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1725 /* 1726 * This case can happen when we get a higher band 1727 * packet. The easiest way is to reset the state machine 1728 * to accommodate the higher priority packet. 1729 */ 1730 state->id_ah_op = NOTSTARTED; 1731 } 1732 mutex_exit(&state->id_ac_mutex); 1733 1734 /* 1735 * The PathRecord lookup failed; retry any other blocked 1736 * Tx requests that might have come in between when we 1737 * initiated the path lookup and now that were sent back 1738 * to GLD to implement single outstanding lookup scheme. 
1739 */ 1740 if (*err == GLD_FAILURE) 1741 gld_sched(state->id_macinfo); 1742 return (ptr); 1743 } 1744 1745 /* 1746 * Grab a not-currently-in-use AH/PathRecord from the active 1747 * list to recycle to a new destination. Only the async thread 1748 * executes this code. 1749 */ 1750 static ibd_ace_t * 1751 ibd_acache_get_unref(ibd_state_t *state) 1752 { 1753 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1754 1755 ASSERT(mutex_owned(&state->id_ac_mutex)); 1756 1757 /* 1758 * Do plain linear search. 1759 */ 1760 while (ptr != NULL) { 1761 /* 1762 * Note that it is possible that the "cycle" bit 1763 * is set on the AH w/o any reference count. The 1764 * mcg must have been deleted, and the tx cleanup 1765 * just decremented the reference count to 0, but 1766 * hasn't gotten around to grabbing the id_ac_mutex 1767 * to move the AH into the free list. 1768 */ 1769 if (GET_REF(ptr) == 0) { 1770 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1771 break; 1772 } 1773 ptr = list_next(&state->id_ah_active, ptr); 1774 } 1775 return (ptr); 1776 } 1777 1778 /* 1779 * Invoked to clean up AH from active list in case of multicast 1780 * disable and to handle sendonly memberships during mcg traps. 1781 * And for port up processing for multicast and unicast AHs. 1782 * Normally, the AH is taken off the active list, and put into 1783 * the free list to be recycled for a new destination. In case 1784 * Tx requests on the AH have not completed yet, the AH is marked 1785 * for reaping (which will put the AH on the free list) once the Tx's 1786 * complete; in this case, depending on the "force" input, we take 1787 * out the AH from the active list right now, or leave it also for 1788 * the reap operation. Returns TRUE if the AH is taken off the active 1789 * list (and either put into the free list right now, or arranged for 1790 * later), FALSE otherwise. 1791 */ 1792 static boolean_t 1793 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1794 { 1795 ibd_ace_t *acactive; 1796 boolean_t ret = B_TRUE; 1797 1798 ASSERT(mutex_owned(&state->id_ac_mutex)); 1799 1800 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1801 1802 /* 1803 * Note that the AH might already have the cycle bit set 1804 * on it; this might happen if sequences of multicast 1805 * enables and disables are coming so fast, that posted 1806 * Tx's to the mcg have not completed yet, and the cycle 1807 * bit is set successively by each multicast disable. 1808 */ 1809 if (SET_CYCLE_IF_REF(acactive)) { 1810 if (!force) { 1811 /* 1812 * The ace is kept on the active list, further 1813 * Tx's can still grab a reference on it; the 1814 * ace is reaped when all pending Tx's 1815 * referencing the AH complete. 1816 */ 1817 ret = B_FALSE; 1818 } else { 1819 /* 1820 * In the mcg trap case, we always pull the 1821 * AH from the active list. And also the port 1822 * up multi/unicast case. 1823 */ 1824 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1825 acactive->ac_mce = NULL; 1826 } 1827 } else { 1828 /* 1829 * Determined the ref count is 0, thus reclaim 1830 * immediately after pulling out the ace from 1831 * the active list. 1832 */ 1833 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1834 acactive->ac_mce = NULL; 1835 IBD_ACACHE_INSERT_FREE(state, acactive); 1836 } 1837 1838 } 1839 return (ret); 1840 } 1841 1842 /* 1843 * Helper function for async path record lookup. If we are trying to 1844 * Tx to a MCG, check our membership, possibly trying to join the 1845 * group if required. 
If that fails, try to send the packet to the 1846 * all router group (indicated by the redirect output), pointing 1847 * the input mac address to the router mcg address. 1848 */ 1849 static ibd_mce_t * 1850 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1851 { 1852 ib_gid_t mgid; 1853 ibd_mce_t *mce; 1854 ipoib_mac_t routermac; 1855 1856 *redirect = B_FALSE; 1857 ibd_n2h_gid(mac, &mgid); 1858 1859 /* 1860 * Check the FullMember+SendOnlyNonMember list. 1861 * Since we are the only one who manipulates the 1862 * id_mc_full list, no locks are needed. 1863 */ 1864 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1865 if (mce != NULL) { 1866 DPRINT(4, "ibd_async_mcache : already joined to group"); 1867 return (mce); 1868 } 1869 1870 /* 1871 * Not found; try to join(SendOnlyNonMember) and attach. 1872 */ 1873 DPRINT(4, "ibd_async_mcache : not joined to group"); 1874 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1875 NULL) { 1876 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1877 return (mce); 1878 } 1879 1880 /* 1881 * MCGroup not present; try to join the all-router group. If 1882 * any of the following steps succeed, we will be redirecting 1883 * to the all router group. 1884 */ 1885 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1886 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1887 return (NULL); 1888 *redirect = B_TRUE; 1889 ibd_n2h_gid(&routermac, &mgid); 1890 bcopy(&routermac, mac, IPOIB_ADDRL); 1891 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1892 mgid.gid_prefix, mgid.gid_guid); 1893 1894 /* 1895 * Are we already joined to the router group? 1896 */ 1897 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1898 DPRINT(4, "ibd_async_mcache : using already joined router" 1899 "group\n"); 1900 return (mce); 1901 } 1902 1903 /* 1904 * Can we join(SendOnlyNonMember) the router group? 1905 */ 1906 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1907 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1908 NULL) { 1909 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1910 return (mce); 1911 } 1912 1913 return (NULL); 1914 } 1915 1916 /* 1917 * Async path record lookup code. 1918 */ 1919 static void 1920 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1921 { 1922 ibd_ace_t *ce; 1923 ibd_mce_t *mce = NULL; 1924 ibt_path_attr_t path_attr; 1925 ibt_path_info_t path_info; 1926 ib_gid_t destgid; 1927 int ret = NOTSTARTED; 1928 1929 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1930 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1931 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1932 htonl(mac->ipoib_gidsuff[1])); 1933 1934 /* 1935 * Check whether we are trying to transmit to a MCG. 1936 * In that case, we need to make sure we are a member of 1937 * the MCG. 1938 */ 1939 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1940 boolean_t redirected; 1941 1942 /* 1943 * If we can not find or join the group or even 1944 * redirect, error out. 1945 */ 1946 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1947 NULL) { 1948 state->id_ah_op = ERRORED; 1949 return; 1950 } 1951 1952 /* 1953 * If we got redirected, we need to determine whether 1954 * the AH for the new mcg is in the cache already, and 1955 * not pull it in then; otherwise proceed to get the 1956 * path for the new mcg. 
There is no guarantee that 1957 * if the AH is currently in the cache, it will still be 1958 * there when we look in ibd_acache_lookup(), but that's 1959 * okay, we will come back here. 1960 */ 1961 if (redirected) { 1962 ret = ROUTERED; 1963 DPRINT(4, "ibd_async_acache : redirected to " 1964 "%08X:%08X:%08X:%08X:%08X", 1965 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1966 htonl(mac->ipoib_gidpref[1]), 1967 htonl(mac->ipoib_gidsuff[0]), 1968 htonl(mac->ipoib_gidsuff[1])); 1969 1970 mutex_enter(&state->id_ac_mutex); 1971 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1972 mutex_exit(&state->id_ac_mutex); 1973 DPRINT(4, "ibd_async_acache : router AH found"); 1974 state->id_ah_op = ROUTERED; 1975 return; 1976 } 1977 mutex_exit(&state->id_ac_mutex); 1978 } 1979 } 1980 1981 /* 1982 * Get an AH from the free list. 1983 */ 1984 mutex_enter(&state->id_ac_mutex); 1985 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1986 /* 1987 * No free ones; try to grab an unreferenced active 1988 * one. Maybe we need to make the active list LRU, 1989 * but that will create more work for Tx callbacks. 1990 * Is there a way of not having to pull out the 1991 * entry from the active list, but just indicate it 1992 * is being recycled? Yes, but that creates one more 1993 * check in the fast lookup path. 1994 */ 1995 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1996 /* 1997 * Pretty serious shortage now. 1998 */ 1999 state->id_ah_op = NOTSTARTED; 2000 mutex_exit(&state->id_ac_mutex); 2001 DPRINT(10, "ibd_async_acache : failed to find AH " 2002 "slot\n"); 2003 return; 2004 } 2005 /* 2006 * We could check whether ac_mce points to a SendOnly 2007 * member and drop that membership now. Or do it lazily 2008 * at detach time. 2009 */ 2010 ce->ac_mce = NULL; 2011 } 2012 mutex_exit(&state->id_ac_mutex); 2013 ASSERT(ce->ac_mce == NULL); 2014 2015 /* 2016 * Update the entry. 2017 */ 2018 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 2019 2020 bzero(&path_info, sizeof (path_info)); 2021 bzero(&path_attr, sizeof (ibt_path_attr_t)); 2022 path_attr.pa_sgid = state->id_sgid; 2023 path_attr.pa_num_dgids = 1; 2024 ibd_n2h_gid(&ce->ac_mac, &destgid); 2025 path_attr.pa_dgids = &destgid; 2026 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2027 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2028 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 2029 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 2030 goto error; 2031 } 2032 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 2033 ntohl(ce->ac_mac.ipoib_qpn), 2034 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 2035 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 2036 goto error; 2037 } 2038 2039 /* 2040 * mce is set whenever an AH is being associated with a 2041 * MCG; this will come in handy when we leave the MCG. The 2042 * lock protects Tx fastpath from scanning the active list. 2043 */ 2044 if (mce != NULL) 2045 ce->ac_mce = mce; 2046 mutex_enter(&state->id_ac_mutex); 2047 IBD_ACACHE_INSERT_ACTIVE(state, ce); 2048 state->id_ah_op = ret; 2049 mutex_exit(&state->id_ac_mutex); 2050 return; 2051 error: 2052 /* 2053 * We might want to drop SendOnly membership here if we 2054 * joined above. The lock protects Tx callbacks inserting 2055 * into the free list. 
2056 */ 2057 mutex_enter(&state->id_ac_mutex); 2058 state->id_ah_op = ERRORED; 2059 IBD_ACACHE_INSERT_FREE(state, ce); 2060 mutex_exit(&state->id_ac_mutex); 2061 } 2062 2063 /* 2064 * While restoring port's presence on the subnet on a port up, it is possible 2065 * that the port goes down again. 2066 */ 2067 static void 2068 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 2069 { 2070 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 2071 int32_t lstate = (opcode == IBD_LINK_DOWN) ? GLD_LINKSTATE_DOWN : 2072 GLD_LINKSTATE_UP; 2073 ibd_mce_t *mce, *pmce; 2074 ibd_ace_t *ace, *pace; 2075 2076 DPRINT(10, "ibd_async_link(): %d", opcode); 2077 2078 /* 2079 * On a link up, revalidate the link speed/width. No point doing 2080 * this on a link down, since we will be unable to do SA operations, 2081 * defaulting to the lowest speed. Also notice that we update our 2082 * notion of speed before calling gld_linkstate(), which will do 2083 * necessary higher level notifications for speed changes. 2084 */ 2085 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 2086 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2087 state->id_link_speed = ibd_get_portspeed(state); 2088 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2089 } 2090 2091 /* 2092 * Do all the work required to establish our presence on 2093 * the subnet. 2094 */ 2095 if (opcode == IBD_LINK_UP_ABSENT) { 2096 /* 2097 * If in promiscuous mode ... 2098 */ 2099 if (state->id_prom_op == COMPLETED) { 2100 /* 2101 * Drop all nonmembership. 2102 */ 2103 ibd_async_unsetprom(state, B_FALSE); 2104 2105 /* 2106 * Then, try to regain nonmembership to all mcg's. 2107 */ 2108 ibd_async_setprom(state, B_FALSE); 2109 2110 } 2111 2112 /* 2113 * Drop all sendonly membership (which also gets rid of the 2114 * AHs); try to reacquire all full membership. 2115 */ 2116 mce = list_head(&state->id_mc_full); 2117 while ((pmce = mce) != NULL) { 2118 mce = list_next(&state->id_mc_full, mce); 2119 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 2120 ibd_leave_group(state, 2121 pmce->mc_info.mc_adds_vect.av_dgid, 2122 IB_MC_JSTATE_SEND_ONLY_NON); 2123 else 2124 ibd_reacquire_group(state, pmce); 2125 } 2126 2127 /* 2128 * Recycle all active AHs to free list (and if there are 2129 * pending posts, make sure they will go into the free list 2130 * once the Tx's complete). Grab the lock to prevent 2131 * concurrent Tx's as well as Tx cleanups. 2132 */ 2133 mutex_enter(&state->id_ac_mutex); 2134 ace = list_head(&state->id_ah_active); 2135 while ((pace = ace) != NULL) { 2136 boolean_t cycled; 2137 2138 ace = list_next(&state->id_ah_active, ace); 2139 mce = pace->ac_mce; 2140 cycled = ibd_acache_recycle(state, &pace->ac_mac, 2141 B_TRUE); 2142 /* 2143 * If this is for an mcg, it must be for a fullmember, 2144 * since we got rid of send-only members above when 2145 * processing the mce list. 2146 */ 2147 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2148 IB_MC_JSTATE_FULL))); 2149 2150 /* 2151 * Check if the fullmember mce needs to be torn down, 2152 * ie whether the DLPI disable has already been done. 2153 * If so, do some of the work of tx_cleanup, namely 2154 * causing leave (which will fail), detach and 2155 * mce-freeing. tx_cleanup will put the AH into free 2156 * list. The reason to duplicate some of this 2157 * tx_cleanup work is because we want to delete the 2158 * AH right now instead of waiting for tx_cleanup, to 2159 * force subsequent Tx's to reacquire an AH.
2160 */ 2161 if ((mce != NULL) && (mce->mc_fullreap)) 2162 ibd_async_reap_group(state, mce, 2163 mce->mc_info.mc_adds_vect.av_dgid, 2164 mce->mc_jstate); 2165 } 2166 mutex_exit(&state->id_ac_mutex); 2167 } 2168 2169 /* 2170 * Macinfo is guaranteed to exist since driver does ibt_close_hca() 2171 * (which stops further events from being delivered) before 2172 * gld_mac_free(). At this point, it is guaranteed that gld_register 2173 * has already been done. 2174 */ 2175 mutex_enter(&state->id_link_mutex); 2176 state->id_link_state = lstate; 2177 gld_linkstate(state->id_macinfo, lstate); 2178 mutex_exit(&state->id_link_mutex); 2179 2180 /* 2181 * Free the request slot allocated by the event thread. 2182 */ 2183 kmem_free(req, sizeof (ibd_req_t)); 2184 2185 ibd_async_done(state); 2186 } 2187 2188 /* 2189 * When the link is notified up, we need to do a few things, based 2190 * on the port's current p_init_type_reply claiming a reinit has been 2191 * done or not. The reinit steps are: 2192 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2193 * the old Pkey and GID0 are correct. 2194 * 2. Register for mcg traps (already done by ibmf). 2195 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2196 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2197 * 4. Give up all sendonly memberships. 2198 * 5. Acquire all full memberships. 2199 * 6. In promiscuous mode, acquire all non memberships. 2200 * 7. Recycle all AHs to free list. 2201 */ 2202 static void 2203 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2204 { 2205 ibt_hca_portinfo_t *port_infop; 2206 ibt_status_t ibt_status; 2207 uint_t psize, port_infosz; 2208 ibd_link_op_t opcode; 2209 ibd_req_t *req; 2210 2211 /* 2212 * Do not send a request to the async daemon if it has not 2213 * yet been created or is being destroyed. If the async 2214 * daemon has not yet been created, we still need to track 2215 * last known state of the link. If this code races with the 2216 * detach path, then we are assured that the detach path has 2217 * not yet done the ibt_close_hca (which waits for all async 2218 * events to complete). If the code races with the attach path, 2219 * we need to validate the pkey/gid (in the link_up case) if 2220 * the initialization path has already set these up and created 2221 * IBTF resources based on the values. 2222 */ 2223 mutex_enter(&state->id_link_mutex); 2224 2225 /* 2226 * If the init code in ibd_drv_init hasn't yet set up the 2227 * pkey/gid, nothing to do; that code will set the link state. 2228 */ 2229 if (state->id_link_state == GLD_LINKSTATE_UNKNOWN) { 2230 mutex_exit(&state->id_link_mutex); 2231 return; 2232 } 2233 2234 if (code == IBT_EVENT_PORT_UP) { 2235 uint8_t itreply; 2236 boolean_t badup = B_FALSE; 2237 2238 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 2239 state->id_port, &port_infop, &psize, &port_infosz); 2240 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 2241 mutex_exit(&state->id_link_mutex); 2242 DPRINT(10, "ibd_link_up : failed in" 2243 " ibt_query_port()\n"); 2244 return; 2245 } 2246 2247 /* 2248 * If the link already went down by the time the handler gets 2249 * here, give up; we can not even validate pkey/gid since those 2250 * are not valid. 2251 */ 2252 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) 2253 badup = B_TRUE; 2254 2255 itreply = port_infop->p_init_type_reply; 2256 2257 /* 2258 * In InitTypeReply, check if NoLoadReply == 2259 * PreserveContentReply == 0, in which case, verify Pkey/GID0. 
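 * If either verification fails, the pkey/gid derived IBTF resources set
 * up at initialization time are stale, so the event is treated as a port
 * down below.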
2260 */ 2261 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2262 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && 2263 (!badup)) { 2264 /* 2265 * Check that the subnet part of GID0 has not changed. 2266 */ 2267 if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, 2268 sizeof (ib_gid_t)) != 0) 2269 badup = B_TRUE; 2270 2271 /* 2272 * Check that Pkey/index mapping is still valid. 2273 */ 2274 if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || 2275 (port_infop->p_pkey_tbl[state->id_pkix] != 2276 state->id_pkey)) 2277 badup = B_TRUE; 2278 } 2279 2280 /* 2281 * In InitTypeReply, if PreservePresenceReply indicates the SM 2282 * has ensured that the port's presence in mcg, traps etc is 2283 * intact, nothing more to do. 2284 */ 2285 opcode = IBD_LINK_UP_ABSENT; 2286 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2287 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) 2288 opcode = IBD_LINK_UP; 2289 2290 if (badup) 2291 code = IBT_ERROR_PORT_DOWN; 2292 ibt_free_portinfo(port_infop, port_infosz); 2293 } 2294 2295 if (!ibd_async_safe(state)) { 2296 state->id_link_state = ((code == IBT_EVENT_PORT_UP) ? 2297 GLD_LINKSTATE_UP : GLD_LINKSTATE_DOWN); 2298 mutex_exit(&state->id_link_mutex); 2299 return; 2300 } 2301 mutex_exit(&state->id_link_mutex); 2302 2303 if (code == IBT_ERROR_PORT_DOWN) 2304 opcode = IBD_LINK_DOWN; 2305 2306 req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); 2307 req->rq_ptr = (void *)opcode; 2308 ibd_queue_work_slot(state, req, ASYNC_LINK); 2309 } 2310 2311 /* 2312 * For the port up/down events, IBTL guarantees there will not be concurrent 2313 * invocations of the handler. IBTL might coalesce link transition events, 2314 * and not invoke the handler for _each_ up/down transition, but it will 2315 * invoke the handler with last known state 2316 */ 2317 static void 2318 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2319 ibt_async_code_t code, ibt_async_event_t *event) 2320 { 2321 ibd_state_t *state = (ibd_state_t *)clnt_private; 2322 2323 switch (code) { 2324 case IBT_ERROR_CATASTROPHIC_CHAN: 2325 ibd_print_warn(state, "catastrophic channel error"); 2326 break; 2327 case IBT_ERROR_CQ: 2328 ibd_print_warn(state, "completion queue error"); 2329 break; 2330 case IBT_ERROR_PORT_DOWN: 2331 case IBT_EVENT_PORT_UP: 2332 /* 2333 * Events will be delivered to all instances that have 2334 * done ibt_open_hca() but not yet done ibt_close_hca(). 2335 * Only need to do work for our port; IBTF will deliver 2336 * events for other ports on the hca we have ibt_open_hca'ed 2337 * too. Note that ibd_drv_init() initializes id_port before 2338 * doing ibt_open_hca(). 2339 */ 2340 ASSERT(state->id_hca_hdl == hca_hdl); 2341 if (state->id_port != event->ev_port) 2342 break; 2343 2344 ibd_link_mod(state, code); 2345 break; 2346 2347 case IBT_HCA_ATTACH_EVENT: 2348 case IBT_HCA_DETACH_EVENT: 2349 /* 2350 * When a new card is plugged to the system, attach_event is 2351 * invoked. Additionally, a cfgadm needs to be run to make the 2352 * card known to the system, and an ifconfig needs to be run to 2353 * plumb up any ibd interfaces on the card. In the case of card 2354 * unplug, a cfgadm is run that will trigger any RCM scripts to 2355 * unplumb the ibd interfaces on the card; when the card is 2356 * actually unplugged, the detach_event is invoked; 2357 * additionally, if any ibd instances are still active on the 2358 * card (eg there were no associated RCM scripts), driver's 2359 * detach routine is invoked. 
2360 */ 2361 break; 2362 default: 2363 break; 2364 } 2365 } 2366 2367 /* 2368 * Attach device to the IO framework. 2369 */ 2370 static int 2371 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2372 { 2373 ibd_state_t *state; 2374 int instance; 2375 2376 switch (cmd) { 2377 case DDI_ATTACH: 2378 break; 2379 case DDI_RESUME: 2380 /* This driver does not support resume */ 2381 default: 2382 return (DDI_FAILURE); 2383 } 2384 2385 /* 2386 * Allocate soft device data structure 2387 */ 2388 instance = ddi_get_instance(dip); 2389 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2390 return (DDI_FAILURE); 2391 state = ddi_get_soft_state(ibd_list, instance); 2392 2393 /* pre ibt_attach() soft state initialization */ 2394 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2395 DPRINT(10, "ibd_attach : failed in ibd_state_init()"); 2396 goto attach_fail_state_init; 2397 } 2398 2399 /* "attach" to IBTL */ 2400 if (ibt_attach(&ibd_clnt_modinfo, dip, state, 2401 &state->id_ibt_hdl) != IBT_SUCCESS) { 2402 DPRINT(10, "ibd_attach : failed in ibt_attach()"); 2403 goto attach_fail_ibt_attach; 2404 } 2405 2406 /* Finish initializing this driver */ 2407 if (ibd_drv_init(state) != DDI_SUCCESS) { 2408 DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); 2409 goto attach_fail_drv_init; 2410 } 2411 2412 /* 2413 * Register ourselves with the GLD interface 2414 * 2415 * gld_register will: 2416 * link us with the GLD module; 2417 * set our ddi_set_driver_private(9F) data to the macinfo ptr; 2418 * save the devinfo pointer in macinfo->gldm_devinfo; 2419 * create the minor device node. 2420 */ 2421 if (gld_register(dip, "ibd", state->id_macinfo) != DDI_SUCCESS) { 2422 DPRINT(10, "ibd_attach : failed in gld_register()"); 2423 goto attach_fail_gld_register; 2424 } 2425 2426 /* 2427 * Setup the handler we will use for regular DLPI stuff. Its important 2428 * to setup the recv handler after registering with gld. Setting it 2429 * before causes at times an incoming packet to be forwarded to gld 2430 * before the gld_register. This will result in gld dropping the packet 2431 * which is ignored by ibd_rcq_handler, thus failing to re-arm the 2432 * tavor events. This will cause tavor_isr on recv path to be not 2433 * invoked any further. 2434 */ 2435 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2436 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 2437 IBT_SUCCESS) { 2438 DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); 2439 goto attach_fail_gld_register; 2440 } 2441 2442 /* 2443 * Setup the subnet notices handler after we initialize the a/mcaches 2444 * and start the async thread, both of which are required for the 2445 * trap handler to function properly. Enable the trap handler to 2446 * queue requests to the async thread after the gld_register, because 2447 * the async daemon invokes gld_sched(), which must be done after 2448 * gld_register(). 2449 */ 2450 ibt_register_subnet_notices(state->id_ibt_hdl, 2451 ibd_snet_notices_handler, state); 2452 mutex_enter(&state->id_trap_lock); 2453 state->id_trap_stop = B_FALSE; 2454 mutex_exit(&state->id_trap_lock); 2455 2456 /* 2457 * Indicate link status to GLD and higher layers. By default, 2458 * we assume we are in up state (which must have been true at 2459 * least at the time the broadcast mcg's were probed); if there 2460 * were any up/down transitions till the time we come here, the 2461 * async handler will have updated last known state, which we 2462 * use to tell GLD. 
The async handler will not send any 2463 * notifications to GLD till we reach here in the initialization 2464 * sequence. 2465 */ 2466 mutex_enter(&state->id_link_mutex); 2467 gld_linkstate(state->id_macinfo, state->id_link_state); 2468 mutex_exit(&state->id_link_mutex); 2469 2470 return (DDI_SUCCESS); 2471 2472 /* Attach failure points, cleanup */ 2473 attach_fail_gld_register: 2474 ibd_drv_fini(state); 2475 2476 attach_fail_drv_init: 2477 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2478 ibd_print_warn(state, "failed to free IB resources"); 2479 2480 attach_fail_ibt_attach: 2481 ibd_state_fini(state); 2482 2483 attach_fail_state_init: 2484 ddi_soft_state_free(ibd_list, instance); 2485 2486 return (DDI_FAILURE); 2487 } 2488 2489 /* 2490 * Detach device from the IO framework. 2491 */ 2492 static int 2493 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2494 { 2495 ibd_state_t *state; 2496 int status; 2497 int instance; 2498 2499 switch (cmd) { 2500 case DDI_DETACH: 2501 break; 2502 case DDI_SUSPEND: 2503 default: 2504 return (DDI_FAILURE); 2505 } 2506 2507 instance = ddi_get_instance(dip); 2508 state = ddi_get_soft_state(ibd_list, instance); 2509 2510 /* 2511 * First, stop receive interrupts; this stops the 2512 * driver from handing up buffers to higher layers. 2513 * Wait for receive buffers to be returned; give up 2514 * after 5 seconds. 2515 */ 2516 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 2517 status = 50; 2518 while (state->id_rx_list.dl_bufs_outstanding > 0) { 2519 delay(drv_usectohz(100000)); 2520 if (--status == 0) { 2521 DPRINT(2, "ibd_detach : reclaiming failed"); 2522 goto failed; 2523 } 2524 } 2525 2526 if (gld_unregister(state->id_macinfo) != DDI_SUCCESS) { 2527 DPRINT(10, "ibd_detach : failed in gld_unregister()"); 2528 goto failed; 2529 } 2530 2531 ibd_drv_fini(state); 2532 2533 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2534 ibd_print_warn(state, "failed to free all IB resources at " 2535 "driver detach time"); 2536 2537 ibd_state_fini(state); 2538 ddi_soft_state_free(ibd_list, instance); 2539 return (DDI_SUCCESS); 2540 2541 failed: 2542 /* 2543 * Reap all the Tx/Rx completions that were posted since we 2544 * turned off the notification. Turn on notifications. There 2545 * is a race in that we do not reap completions that come in 2546 * after the poll and before notifications get turned on. That 2547 * is okay, the next rx/tx packet will trigger a completion 2548 * that will reap any missed completions. 
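 * The ibd_poll_compq() call below performs that reap before the CQ
 * handler is re-registered.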
2549 */ 2550 ibd_poll_compq(state, state->id_rcq_hdl); 2551 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2552 return (DDI_FAILURE); 2553 } 2554 2555 /* 2556 * Pre ibt_attach() driver initialization 2557 */ 2558 static int 2559 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2560 { 2561 gld_mac_info_t *macinfo; 2562 2563 if ((macinfo = gld_mac_alloc(dip)) == NULL) { 2564 DPRINT(10, "ibd_state_init : failed in gld_mac_alloc()"); 2565 return (DDI_FAILURE); 2566 } 2567 2568 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2569 state->id_link_state = GLD_LINKSTATE_UNKNOWN; 2570 2571 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2572 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2573 state->id_trap_stop = B_TRUE; 2574 state->id_trap_inprog = 0; 2575 2576 mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL); 2577 state->id_dip = dip; 2578 state->id_wcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); 2579 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); 2580 2581 state->id_sched_queued = B_FALSE; 2582 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2583 2584 state->id_tx_list.dl_head = NULL; 2585 state->id_tx_list.dl_tail = NULL; 2586 state->id_tx_list.dl_pending_sends = B_FALSE; 2587 state->id_tx_list.dl_cnt = 0; 2588 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2589 2590 state->id_rx_list.dl_head = NULL; 2591 state->id_rx_list.dl_tail = NULL; 2592 state->id_rx_list.dl_bufs_outstanding = 0; 2593 state->id_rx_list.dl_cnt = 0; 2594 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2595 2596 /* Link up various structs for later access */ 2597 macinfo->gldm_private = (caddr_t)state; 2598 state->id_macinfo = macinfo; 2599 2600 /* 2601 * Initialize pointers to device specific functions which will be 2602 * used by the generic layer. 2603 */ 2604 macinfo->gldm_reset = ibd_reset; 2605 macinfo->gldm_start = ibd_start; 2606 macinfo->gldm_stop = ibd_stop; 2607 macinfo->gldm_set_mac_addr = ibd_set_mac_addr; 2608 macinfo->gldm_set_multicast = ibd_set_multicast; 2609 macinfo->gldm_set_promiscuous = ibd_set_promiscuous; 2610 macinfo->gldm_get_stats = ibd_get_stats; 2611 macinfo->gldm_send = ibd_send; 2612 macinfo->gldm_intr = ibd_intr; 2613 macinfo->gldm_mdt_pre = ibd_mdt_pre; 2614 macinfo->gldm_mdt_send = ibd_mdt_txone; 2615 macinfo->gldm_mdt_post = ibd_mdt_post; 2616 macinfo->gldm_mdt_sgl = state->id_max_sqseg; 2617 macinfo->gldm_mdt_segs = IBD_MDTMAX_SEGS; 2618 2619 /* Initialize board characteristics needed by the generic layer. 
*/ 2620 macinfo->gldm_ident = "InfiniBand DLPI Driver"; 2621 macinfo->gldm_type = DL_IB; 2622 macinfo->gldm_minpkt = 0; /* assumes we pad ourselves */ 2623 macinfo->gldm_addrlen = IPOIB_ADDRL; 2624 macinfo->gldm_saplen = -2; 2625 macinfo->gldm_capabilities = GLD_CAP_LINKSTATE; 2626 2627 /* Other required initialization */ 2628 macinfo->gldm_ppa = ddi_get_instance(dip); 2629 macinfo->gldm_devinfo = dip; 2630 2631 return (DDI_SUCCESS); 2632 } 2633 2634 /* 2635 * Post ibt_detach() driver deconstruction 2636 */ 2637 static void 2638 ibd_state_fini(ibd_state_t *state) 2639 { 2640 mutex_destroy(&state->id_tx_list.dl_mutex); 2641 mutex_destroy(&state->id_rx_list.dl_mutex); 2642 mutex_destroy(&state->id_sched_lock); 2643 mutex_destroy(&state->id_txcomp_lock); 2644 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); 2645 kmem_free(state->id_wcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); 2646 cv_destroy(&state->id_trap_cv); 2647 mutex_destroy(&state->id_trap_lock); 2648 mutex_destroy(&state->id_link_mutex); 2649 gld_mac_free(state->id_macinfo); 2650 } 2651 2652 /* 2653 * Fetch IBA parameters for the network device from IB nexus. 2654 */ 2655 static int 2656 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) 2657 { 2658 /* 2659 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. 2660 * Note that the default partition is also allowed. 2661 */ 2662 state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2663 0, "port-pkey", IB_PKEY_INVALID_LIMITED); 2664 if (state->id_pkey <= IB_PKEY_INVALID_FULL) { 2665 DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" 2666 "partition\n"); 2667 return (DDI_FAILURE); 2668 } 2669 2670 /* 2671 * ... the IBA port ... 2672 */ 2673 state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2674 0, "port-number", 0); 2675 if (state->id_port == 0) { 2676 DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); 2677 return (DDI_FAILURE); 2678 } 2679 2680 /* 2681 * ... and HCA GUID. 2682 */ 2683 *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 2684 0, "hca-guid", 0); 2685 if (*hca_guid == 0) { 2686 DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " 2687 "guid\n"); 2688 return (DDI_FAILURE); 2689 } 2690 2691 return (DDI_SUCCESS); 2692 } 2693 2694 /* 2695 * Fetch link speed from SA for snmp ifspeed reporting. 2696 */ 2697 static uint64_t 2698 ibd_get_portspeed(ibd_state_t *state) 2699 { 2700 int ret; 2701 uint64_t ifspeed; 2702 size_t length; 2703 ib_lid_t lid; 2704 sa_portinfo_record_t req, *resp = NULL; 2705 ibmf_saa_access_args_t args; 2706 ibmf_saa_handle_t saa_handle; 2707 2708 /* 2709 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2710 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2711 * 2000000000. Start with that as default. 
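 * The link width multipliers applied below scale this up; eg a 4X link
 * is reported as 8000000000 and a 12X link as 24000000000.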
2712 */ 2713 ifspeed = 2000000000; 2714 2715 /* Get port lid */ 2716 if (ibt_get_port_state(state->id_hca_hdl, state->id_port, NULL, 2717 &lid) != IBT_SUCCESS) 2718 goto earlydone; 2719 2720 if (ibmf_sa_session_open(state->id_sgid.gid_guid, 0, NULL, 2721 IBMF_VERSION, 0, &saa_handle) != IBMF_SUCCESS) 2722 goto earlydone; 2723 2724 /* Contact SA Access */ 2725 bzero(&req, sizeof (sa_portinfo_record_t)); 2726 req.EndportLID = lid; 2727 2728 args.sq_attr_id = SA_PORTINFORECORD_ATTRID; 2729 args.sq_access_type = IBMF_SAA_RETRIEVE; 2730 args.sq_component_mask = SA_PORTINFO_COMPMASK_PORTLID; 2731 args.sq_template = &req; 2732 args.sq_callback = NULL; 2733 args.sq_callback_arg = NULL; 2734 2735 ret = ibmf_sa_access(saa_handle, &args, 0, &length, (void **) &resp); 2736 if ((ret != IBMF_SUCCESS) || (length == 0) || (resp == NULL)) 2737 goto done; 2738 2739 /* 2740 * 4X/12X needs appropriate multipliers. With IBA 1.2 additions, 2741 * double and quad multipliers are also needed per LinkSpeedEnabled. 2742 * In case SA does not return an expected value, report the default 2743 * speed as 1X. 2744 */ 2745 ret = 1; 2746 switch (resp->PortInfo.LinkWidthActive) { 2747 case SM_LINK_WIDTH_ACTIVE_1X: 2748 ret = 1; 2749 break; 2750 case SM_LINK_WIDTH_ACTIVE_4X: 2751 ret = 4; 2752 break; 2753 case SM_LINK_WIDTH_ACTIVE_12X: 2754 ret = 12; 2755 break; 2756 } 2757 ifspeed *= ret; 2758 kmem_free(resp, length); 2759 2760 done: 2761 (void) ibmf_sa_session_close(&saa_handle, 0); 2762 2763 earlydone: 2764 return (ifspeed); 2765 } 2766 2767 /* 2768 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2769 * representing the input mcg mgid. 2770 */ 2771 static ibd_mce_t * 2772 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2773 { 2774 ibd_mce_t *ptr = list_head(mlist); 2775 2776 /* 2777 * Do plain linear search. 2778 */ 2779 while (ptr != NULL) { 2780 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2781 sizeof (ib_gid_t)) == 0) 2782 return (ptr); 2783 ptr = list_next(mlist, ptr); 2784 } 2785 return (NULL); 2786 } 2787 2788 /* 2789 * Execute IBA JOIN. 2790 */ 2791 static ibt_status_t 2792 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2793 { 2794 ibt_mcg_attr_t mcg_attr; 2795 2796 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2797 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2798 mcg_attr.mc_mgid = mgid; 2799 mcg_attr.mc_join_state = mce->mc_jstate; 2800 mcg_attr.mc_scope = state->id_scope; 2801 mcg_attr.mc_pkey = state->id_pkey; 2802 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2803 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2804 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2805 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2806 NULL, NULL)); 2807 } 2808 2809 /* 2810 * This code JOINs the port in the proper way (depending on the join 2811 * state) so that IBA fabric will forward mcg packets to/from the port. 2812 * It also attaches the QPN to the mcg so it can receive those mcg 2813 * packets. This code makes sure not to attach the mcg to the QP if 2814 * that has been previously done due to the mcg being joined with a 2815 * different join state, even though this is not required by SWG_0216, 2816 * refid 3610. 
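 * In particular, a QP attach is issued at most once per mcg: send-only
 * joins never attach, and the attach is skipped when the mcg is already
 * attached via another join state.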
2817 */ 2818 static ibd_mce_t * 2819 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2820 { 2821 ibt_status_t ibt_status; 2822 ibd_mce_t *mce, *tmce, *omce = NULL; 2823 boolean_t do_attach = B_TRUE; 2824 2825 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2826 jstate, mgid.gid_prefix, mgid.gid_guid); 2827 2828 /* 2829 * For enable_multicast Full member joins, we need to do some 2830 * extra work. If there is already an mce on the list that 2831 * indicates full membership, that means the membership has 2832 * not yet been dropped (since the disable_multicast was issued) 2833 * because there are pending Tx's to the mcg; in that case, just 2834 * mark the mce not to be reaped when the Tx completion queues 2835 * an async reap operation. 2836 * 2837 * If there is already an mce on the list indicating sendonly 2838 * membership, try to promote to full membership. Be careful 2839 * not to deallocate the old mce, since there might be an AH 2840 * pointing to it; instead, update the old mce with new data 2841 * that tracks the full membership. 2842 */ 2843 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2844 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2845 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2846 ASSERT(omce->mc_fullreap); 2847 omce->mc_fullreap = B_FALSE; 2848 return (omce); 2849 } else { 2850 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2851 } 2852 } 2853 2854 /* 2855 * Allocate the ibd_mce_t to track this JOIN. 2856 */ 2857 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2858 mce->mc_fullreap = B_FALSE; 2859 mce->mc_jstate = jstate; 2860 2861 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2862 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2863 ibt_status); 2864 kmem_free(mce, sizeof (ibd_mce_t)); 2865 return (NULL); 2866 } 2867 2868 /* 2869 * Is an IBA attach required? Not if the interface is already joined 2870 * to the mcg in a different appropriate join state. 2871 */ 2872 if (jstate == IB_MC_JSTATE_NON) { 2873 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2874 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2875 do_attach = B_FALSE; 2876 } else if (jstate == IB_MC_JSTATE_FULL) { 2877 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2878 do_attach = B_FALSE; 2879 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2880 do_attach = B_FALSE; 2881 } 2882 2883 if (do_attach) { 2884 /* 2885 * Do the IBA attach. 2886 */ 2887 DPRINT(10, "ibd_join_group : ibt_attach_mcg \n"); 2888 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2889 &mce->mc_info)) != IBT_SUCCESS) { 2890 DPRINT(10, "ibd_join_group : failed qp attachment " 2891 "%d\n", ibt_status); 2892 /* 2893 * NOTE that we should probably preserve the join info 2894 * in the list and later try to leave again at detach 2895 * time. 2896 */ 2897 (void) ibt_leave_mcg(state->id_sgid, mgid, 2898 state->id_sgid, jstate); 2899 kmem_free(mce, sizeof (ibd_mce_t)); 2900 return (NULL); 2901 } 2902 } 2903 2904 /* 2905 * Insert the ibd_mce_t in the proper list. 2906 */ 2907 if (jstate == IB_MC_JSTATE_NON) { 2908 IBD_MCACHE_INSERT_NON(state, mce); 2909 } else { 2910 /* 2911 * Set up the mc_req fields used for reaping the 2912 * mcg in case of delayed tx completion (see 2913 * ibd_tx_cleanup()). Also done for sendonly join in 2914 * case we are promoted to fullmembership later and 2915 * keep using the same mce. 
2916 */ 2917 mce->mc_req.rq_gid = mgid; 2918 mce->mc_req.rq_ptr = mce; 2919 /* 2920 * Check whether this is the case of trying to join 2921 * full member, and we were already joined send only. 2922 * We try to drop our SendOnly membership, but it is 2923 * possible that the mcg does not exist anymore (and 2924 * the subnet trap never reached us), so the leave 2925 * operation might fail. 2926 */ 2927 if (omce != NULL) { 2928 (void) ibt_leave_mcg(state->id_sgid, mgid, 2929 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2930 omce->mc_jstate = IB_MC_JSTATE_FULL; 2931 bcopy(&mce->mc_info, &omce->mc_info, 2932 sizeof (ibt_mcg_info_t)); 2933 kmem_free(mce, sizeof (ibd_mce_t)); 2934 return (omce); 2935 } 2936 mutex_enter(&state->id_mc_mutex); 2937 IBD_MCACHE_INSERT_FULL(state, mce); 2938 mutex_exit(&state->id_mc_mutex); 2939 } 2940 2941 return (mce); 2942 } 2943 2944 /* 2945 * Called during port up event handling to attempt to reacquire full 2946 * membership to an mcg. Stripped down version of ibd_join_group(). 2947 * Note that it is possible that the mcg might have gone away, and 2948 * gets recreated at this point. 2949 */ 2950 static void 2951 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2952 { 2953 ib_gid_t mgid; 2954 2955 /* 2956 * If the mc_fullreap flag is set, or this join fails, a subsequent 2957 * reap/leave is going to try to leave the group. We could prevent 2958 * that by adding a boolean flag into ibd_mce_t, if required. 2959 */ 2960 if (mce->mc_fullreap) 2961 return; 2962 2963 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2964 2965 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2966 mgid.gid_guid); 2967 2968 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2969 ibd_print_warn(state, "Failure on port up to rejoin " 2970 "multicast gid %016llx:%016llx", 2971 (u_longlong_t)mgid.gid_prefix, 2972 (u_longlong_t)mgid.gid_guid); 2973 } 2974 2975 /* 2976 * This code handles delayed Tx completion cleanups for mcg's to which 2977 * disable_multicast has been issued, regular mcg related cleanups during 2978 * disable_multicast, disable_promiscuous and mcg traps, as well as 2979 * cleanups during driver detach time. Depending on the join state, 2980 * it deletes the mce from the appropriate list and issues the IBA 2981 * leave/detach; except in the disable_multicast case when the mce 2982 * is left on the active list for a subsequent Tx completion cleanup. 2983 */ 2984 static void 2985 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2986 uint8_t jstate) 2987 { 2988 ibd_mce_t *tmce; 2989 boolean_t do_detach = B_TRUE; 2990 2991 /* 2992 * Before detaching, we must check whether the other list 2993 * contains the mcg; if we detach blindly, the consumer 2994 * who set up the other list will also stop receiving 2995 * traffic. 2996 */ 2997 if (jstate == IB_MC_JSTATE_FULL) { 2998 /* 2999 * The following check is only relevant while coming 3000 * from the Tx completion path in the reap case.
3001 */ 3002 if (!mce->mc_fullreap) 3003 return; 3004 mutex_enter(&state->id_mc_mutex); 3005 IBD_MCACHE_PULLOUT_FULL(state, mce); 3006 mutex_exit(&state->id_mc_mutex); 3007 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3008 do_detach = B_FALSE; 3009 } else if (jstate == IB_MC_JSTATE_NON) { 3010 IBD_MCACHE_PULLOUT_NON(state, mce); 3011 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3012 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3013 do_detach = B_FALSE; 3014 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3015 mutex_enter(&state->id_mc_mutex); 3016 IBD_MCACHE_PULLOUT_FULL(state, mce); 3017 mutex_exit(&state->id_mc_mutex); 3018 do_detach = B_FALSE; 3019 } 3020 3021 /* 3022 * If we are reacting to a mcg trap and leaving our sendonly or 3023 * non membership, the mcg is possibly already gone, so attempting 3024 * to leave might fail. On the other hand, we must try to leave 3025 * anyway, since this might be a trap from long ago, and we could 3026 * have potentially sendonly joined to a recent incarnation of 3027 * the mcg and are about to lose track of this information. 3028 */ 3029 if (do_detach) { 3030 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3031 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3032 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3033 } 3034 3035 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3036 kmem_free(mce, sizeof (ibd_mce_t)); 3037 } 3038 3039 /* 3040 * Async code executed due to multicast and promiscuous disable requests 3041 * and mcg trap handling; also executed during driver detach. Mostly, a 3042 * leave and detach is done; except for the fullmember case when Tx 3043 * requests are pending, whence arrangements are made for subsequent 3044 * cleanup on Tx completion. 3045 */ 3046 static void 3047 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3048 { 3049 ipoib_mac_t mcmac; 3050 boolean_t recycled; 3051 ibd_mce_t *mce; 3052 3053 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3054 jstate, mgid.gid_prefix, mgid.gid_guid); 3055 3056 if (jstate == IB_MC_JSTATE_NON) { 3057 recycled = B_TRUE; 3058 mce = IBD_MCACHE_FIND_NON(state, mgid); 3059 /* 3060 * In case we are handling a mcg trap, we might not find 3061 * the mcg in the non list. 3062 */ 3063 if (mce == NULL) 3064 return; 3065 } else { 3066 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3067 3068 /* 3069 * In case we are handling a mcg trap, make sure the trap 3070 * is not arriving late; if we have an mce that indicates 3071 * that we are already a fullmember, that would be a clear 3072 * indication that the trap arrived late (ie, is for a 3073 * previous incarnation of the mcg). 3074 */ 3075 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3076 if ((mce == NULL) || (mce->mc_jstate == 3077 IB_MC_JSTATE_FULL)) 3078 return; 3079 ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3080 } else { 3081 ASSERT(jstate == IB_MC_JSTATE_FULL); 3082 ASSERT((mce != NULL) && (mce->mc_jstate == 3083 IB_MC_JSTATE_FULL)); 3084 mce->mc_fullreap = B_TRUE; 3085 } 3086 3087 /* 3088 * If no pending Tx's remain that reference the AH 3089 * for the mcg, recycle it from active to free list. 3090 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3091 * so the last completing Tx will cause an async reap 3092 * operation to be invoked, at which time we will drop our 3093 * membership to the mcg so that the pending Tx's complete 3094 * successfully. Refer to comments on "AH and MCE active 3095 * list manipulation" at top of this file.
The lock protects 3096 * against Tx fast path and Tx cleanup code. 3097 */ 3098 mutex_enter(&state->id_ac_mutex); 3099 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3100 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3101 IB_MC_JSTATE_SEND_ONLY_NON)); 3102 mutex_exit(&state->id_ac_mutex); 3103 } 3104 3105 if (recycled) { 3106 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3107 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3108 ibd_async_reap_group(state, mce, mgid, jstate); 3109 } 3110 } 3111 3112 /* 3113 * Find the broadcast address as defined by IPoIB; implicitly 3114 * determines the IBA scope, mtu, tclass etc of the link the 3115 * interface is going to be a member of. 3116 */ 3117 static ibt_status_t 3118 ibd_find_bgroup(ibd_state_t *state) 3119 { 3120 ibt_mcg_attr_t mcg_attr; 3121 uint_t numg; 3122 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3123 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3124 IB_MC_SCOPE_GLOBAL }; 3125 int i, mcgmtu; 3126 boolean_t found = B_FALSE; 3127 3128 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3129 mcg_attr.mc_pkey = state->id_pkey; 3130 state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK; 3131 3132 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3133 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3134 3135 /* 3136 * Look for the IPoIB broadcast group. 3137 */ 3138 state->id_mgid.gid_prefix = 3139 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3140 ((uint64_t)state->id_scope << 48) | 3141 ((uint32_t)(state->id_pkey << 16))); 3142 mcg_attr.mc_mgid = state->id_mgid; 3143 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3144 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3145 found = B_TRUE; 3146 break; 3147 } 3148 3149 } 3150 3151 if (!found) { 3152 ibd_print_warn(state, "IPoIB broadcast group absent"); 3153 return (IBT_FAILURE); 3154 } 3155 3156 /* 3157 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3158 */ 3159 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3160 if (state->id_mtu < mcgmtu) { 3161 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3162 "greater than port's maximum MTU %d", mcgmtu, 3163 state->id_mtu); 3164 return (IBT_FAILURE); 3165 } 3166 state->id_mtu = mcgmtu; 3167 3168 return (IBT_SUCCESS); 3169 } 3170 3171 /* 3172 * Post ibt_attach() initialization. 3173 */ 3174 static int 3175 ibd_drv_init(ibd_state_t *state) 3176 { 3177 kthread_t *kht; 3178 ibt_ud_chan_alloc_args_t ud_alloc_attr; 3179 ibt_ud_chan_query_attr_t ud_chan_attr; 3180 ibt_hca_portinfo_t *port_infop; 3181 ibt_hca_attr_t hca_attrs; 3182 ibt_status_t ibt_status; 3183 ibt_cq_attr_t cq_attr; 3184 ib_guid_t hca_guid; 3185 uint32_t real_size; 3186 uint32_t *ptr; 3187 char pathname[OBP_MAXPATHLEN]; 3188 uint_t psize, port_infosz; 3189 3190 /* 3191 * Initialize id_port before ibt_open_hca because of 3192 * ordering requirements in port up/down handling. 
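 * (ibd_async_handler() filters port events by comparing event->ev_port
 * against id_port, so id_port must be valid before ibt_open_hca() makes
 * event delivery possible.)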
3193 */ 3194 if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) 3195 return (DDI_FAILURE); 3196 3197 if (ibt_open_hca(state->id_ibt_hdl, hca_guid, 3198 &state->id_hca_hdl) != IBT_SUCCESS) { 3199 DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); 3200 return (DDI_FAILURE); 3201 } 3202 3203 mutex_enter(&state->id_link_mutex); 3204 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 3205 state->id_port, &port_infop, &psize, 3206 &port_infosz); 3207 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 3208 mutex_exit(&state->id_link_mutex); 3209 DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); 3210 (void) ibt_close_hca(state->id_hca_hdl); 3211 return (DDI_FAILURE); 3212 } 3213 3214 /* 3215 * If the link already went down by the time we get here, give up; 3216 * we can not even get the gid since that is not valid. We would 3217 * fail in ibd_find_bgroup() anyway. 3218 */ 3219 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 3220 mutex_exit(&state->id_link_mutex); 3221 ibt_free_portinfo(port_infop, port_infosz); 3222 (void) ibt_close_hca(state->id_hca_hdl); 3223 ibd_print_warn(state, "Port is not active"); 3224 return (DDI_FAILURE); 3225 } 3226 3227 /* 3228 * This verifies the Pkey ibnexus handed us is still valid. 3229 * This is also the point from which the pkey table for the 3230 * port must hold the exact pkey value at the exact index 3231 * across port up/downs. 3232 */ 3233 if (ibt_pkey2index(state->id_hca_hdl, state->id_port, 3234 state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { 3235 mutex_exit(&state->id_link_mutex); 3236 ibt_free_portinfo(port_infop, port_infosz); 3237 DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); 3238 (void) ibt_close_hca(state->id_hca_hdl); 3239 return (DDI_FAILURE); 3240 } 3241 3242 state->id_mtu = (128 << port_infop->p_mtu); 3243 state->id_sgid = *port_infop->p_sgid_tbl; 3244 state->id_link_state = GLD_LINKSTATE_UP; 3245 mutex_exit(&state->id_link_mutex); 3246 3247 ibt_free_portinfo(port_infop, port_infosz); 3248 3249 state->id_link_speed = ibd_get_portspeed(state); 3250 3251 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 3252 ASSERT(ibt_status == IBT_SUCCESS); 3253 3254 /* 3255 * We need to determine whether the HCA can support checksum 3256 * and indicate that to higher layers. 3257 */ 3258 if (ibd_csum_send > IBD_CSUM_NONE) 3259 state->id_macinfo->gldm_capabilities |= GLD_CAP_CKSUM_PARTIAL; 3260 3261 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 3262 DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); 3263 goto drv_init_fail_find_bgroup; 3264 } 3265 state->id_macinfo->gldm_maxpkt = state->id_mtu - IPOIB_HDRSIZE; 3266 3267 if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 3268 &state->id_pd_hdl) != IBT_SUCCESS) { 3269 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); 3270 goto drv_init_fail_alloc_pd; 3271 } 3272 3273 /* Initialize the parallel ARP cache and AHs */ 3274 if (ibd_acache_init(state) != DDI_SUCCESS) { 3275 DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); 3276 goto drv_init_fail_acache; 3277 } 3278 3279 /* 3280 * Check various tunable limits. 3281 */ 3282 if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) { 3283 ibd_print_warn(state, "Setting #sgl = %d instead of default %d", 3284 hca_attrs.hca_max_sgl, IBD_MAX_SQSEG); 3285 state->id_max_sqseg = hca_attrs.hca_max_sgl; 3286 } else { 3287 state->id_max_sqseg = IBD_MAX_SQSEG; 3288 } 3289 3290 /* 3291 * First, check #r/s wqes against max channel size. 
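 * The requested IBD_NUM_RWQE/IBD_NUM_SWQE defaults are clamped to
 * hca_max_chan_sz here, and may be clamped again below to fit within
 * the supportable CQ size.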
3292 */ 3293 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) 3294 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 3295 else 3296 state->id_num_rwqe = IBD_NUM_RWQE; 3297 3298 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) 3299 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 3300 else 3301 state->id_num_swqe = IBD_NUM_SWQE; 3302 3303 /* 3304 * Allocate Rx/combined CQ: 3305 * Theoretically, there is no point in having more than #rwqe 3306 * plus #swqe cqe's, except that the CQ will be signalled for 3307 * overflow when the last wqe completes, if none of the previous 3308 * cqe's have been polled. Thus, we allocate just a few less wqe's 3309 * to make sure such overflow does not occur. 3310 */ 3311 cq_attr.cq_sched = NULL; 3312 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 3313 3314 if (ibd_separate_cqs == 1) { 3315 /* 3316 * Allocate Receive CQ. 3317 */ 3318 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 3319 cq_attr.cq_size = state->id_num_rwqe + 1; 3320 } else { 3321 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3322 state->id_num_rwqe = cq_attr.cq_size - 1; 3323 } 3324 3325 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3326 ibd_print_warn(state, "Computed #rwqe %d based on " 3327 "requested size and supportable CQ size is less " 3328 "than the required threshold %d", 3329 state->id_num_rwqe, IBD_RX_THRESHOLD); 3330 goto drv_init_fail_min_rwqes; 3331 } 3332 3333 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3334 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3335 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3336 goto drv_init_fail_alloc_rcq; 3337 } 3338 3339 /* 3340 * Allocate Send CQ. 3341 */ 3342 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 3343 cq_attr.cq_size = state->id_num_swqe + 1; 3344 } else { 3345 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3346 state->id_num_swqe = cq_attr.cq_size - 1; 3347 } 3348 3349 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3350 &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { 3351 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3352 goto drv_init_fail_alloc_scq; 3353 } 3354 } else { 3355 /* 3356 * Allocate combined Send/Receive CQ. 3357 */ 3358 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 3359 state->id_num_swqe + 1)) { 3360 cq_attr.cq_size = state->id_num_rwqe + 3361 state->id_num_swqe + 1; 3362 } else { 3363 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3364 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 3365 state->id_num_rwqe) / (state->id_num_rwqe + 3366 state->id_num_swqe); 3367 state->id_num_swqe = cq_attr.cq_size - 1 - 3368 state->id_num_rwqe; 3369 } 3370 3371 if (state->id_num_rwqe < IBD_RX_THRESHOLD) { 3372 ibd_print_warn(state, "Computed #rwqe %d based on " 3373 "requested size and supportable CQ size is less " 3374 "than the required threshold %d", 3375 state->id_num_rwqe, IBD_RX_THRESHOLD); 3376 goto drv_init_fail_min_rwqes; 3377 } 3378 3379 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3380 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3381 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3382 goto drv_init_fail_alloc_rcq; 3383 } 3384 state->id_scq_hdl = state->id_rcq_hdl; 3385 } 3386 3387 /* 3388 * Print message in case we could not allocate as many wqe's 3389 * as was requested. Note that in the combined CQ case, we will 3390 * get the following message. 
3391 */ 3392 if (state->id_num_rwqe != IBD_NUM_RWQE) 3393 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 3394 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 3395 if (state->id_num_swqe != IBD_NUM_SWQE) 3396 ibd_print_warn(state, "Setting #swqe = %d instead of default " 3397 "%d", state->id_num_swqe, IBD_NUM_SWQE); 3398 3399 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 3400 ud_alloc_attr.ud_hca_port_num = state->id_port; 3401 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 3402 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 3403 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 3404 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 3405 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 3406 ud_alloc_attr.ud_scq = state->id_scq_hdl; 3407 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 3408 ud_alloc_attr.ud_pd = state->id_pd_hdl; 3409 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 3410 ud_alloc_attr.ud_clone_chan = NULL; 3411 if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 3412 &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { 3413 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" 3414 "\n"); 3415 goto drv_init_fail_alloc_chan; 3416 } 3417 3418 if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != 3419 DDI_SUCCESS) { 3420 DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); 3421 goto drv_init_fail_query_chan; 3422 } 3423 state->id_qpnum = ud_chan_attr.ud_qpn; 3424 3425 /* Initialize the Transmit buffer list */ 3426 if (ibd_init_txlist(state) != DDI_SUCCESS) { 3427 DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); 3428 goto drv_init_fail_txlist_init; 3429 } 3430 3431 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 3432 /* Setup the handler we will use for regular DLPI stuff */ 3433 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 3434 if (ibt_enable_cq_notify(state->id_scq_hdl, 3435 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 3436 DPRINT(10, "ibd_drv_init : failed in" 3437 " ibt_enable_cq_notify()\n"); 3438 goto drv_init_fail_cq_notify; 3439 } 3440 } 3441 3442 /* Create the service fifos before we start receiving */ 3443 if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos, 3444 state)) == NULL) { 3445 DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n"); 3446 goto drv_init_fail_srv_fifo; 3447 } 3448 3449 /* Initialize the Receive buffer list */ 3450 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 3451 DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); 3452 goto drv_init_fail_rxlist_init; 3453 } 3454 3455 /* Join to IPoIB broadcast group as required by IPoIB */ 3456 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 3457 DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); 3458 goto drv_init_fail_join_group; 3459 } 3460 3461 /* Create the async thread */ 3462 if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 3463 TS_RUN, minclsyspri)) == NULL) { 3464 /* Do we have to specially leave the group? */ 3465 DPRINT(10, "ibd_drv_init : failed in thread_create\n"); 3466 goto drv_init_fail_thread_create; 3467 } 3468 state->id_async_thrid = kht->t_did; 3469 3470 /* 3471 * The local mac address is now known. Create the IPoIB 3472 * address. 3473 */ 3474 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 3475 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3476 state->id_macinfo->gldm_vendor_addr = (uchar_t *)&state->id_macaddr; 3477 3478 /* 3479 * Similarly, program in the broadcast mac address. 
3480 */ 3481 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, 3482 state->id_mgid.gid_guid); 3483 state->id_macinfo->gldm_broadcast_addr = (uchar_t *)&state->id_bcaddr; 3484 3485 ptr = (uint32_t *)&state->id_macaddr; 3486 DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", 3487 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3488 ptr = (uint32_t *)&state->id_bcaddr; 3489 DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", 3490 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3491 DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", 3492 state->id_pkey, state->id_mgid.gid_prefix, 3493 state->id_mgid.gid_guid); 3494 DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", 3495 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3496 DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); 3497 DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); 3498 (void) ddi_pathname(state->id_dip, pathname); 3499 DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); 3500 3501 return (DDI_SUCCESS); 3502 3503 drv_init_fail_thread_create: 3504 ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL); 3505 3506 drv_init_fail_join_group: 3507 ibd_fini_rxlist(state); 3508 3509 drv_init_fail_rxlist_init: 3510 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3511 3512 drv_init_fail_srv_fifo: 3513 drv_init_fail_cq_notify: 3514 ibd_fini_txlist(state); 3515 3516 drv_init_fail_txlist_init: 3517 drv_init_fail_query_chan: 3518 if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) 3519 DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); 3520 3521 drv_init_fail_alloc_chan: 3522 if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != 3523 IBT_SUCCESS)) 3524 DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); 3525 3526 drv_init_fail_alloc_scq: 3527 if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) 3528 DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); 3529 3530 drv_init_fail_min_rwqes: 3531 drv_init_fail_alloc_rcq: 3532 ibd_acache_fini(state); 3533 drv_init_fail_acache: 3534 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3535 DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); 3536 3537 drv_init_fail_alloc_pd: 3538 ibt_free_mcg_info(state->id_mcinfo, 1); 3539 drv_init_fail_find_bgroup: 3540 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3541 DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); 3542 3543 return (DDI_FAILURE); 3544 } 3545 3546 /* 3547 * Allocate the statically allocated Tx buffer list. 3548 */ 3549 static int 3550 ibd_init_txlist(ibd_state_t *state) 3551 { 3552 ibd_swqe_t *swqe; 3553 int i; 3554 3555 for (i = 0; i < state->id_num_swqe; i++) { 3556 if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) { 3557 DPRINT(10, "ibd_init_txlist : failed in " 3558 "ibd_alloc_swqe()\n"); 3559 ibd_fini_txlist(state); 3560 return (DDI_FAILURE); 3561 } 3562 3563 /* add to list */ 3564 state->id_tx_list.dl_cnt++; 3565 if (state->id_tx_list.dl_head == NULL) { 3566 swqe->swqe_prev = NULL; 3567 swqe->swqe_next = NULL; 3568 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3569 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3570 } else { 3571 swqe->swqe_prev = state->id_tx_list.dl_tail; 3572 swqe->swqe_next = NULL; 3573 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3574 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3575 } 3576 } 3577 3578 return (DDI_SUCCESS); 3579 } 3580 3581 /* 3582 * Free the statically allocated Tx buffer list. 
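* This is invoked from the ibd_drv_init() failure path as well as from ibd_drv_fini(); the Tx list mutex is held while the list is drained.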
3583 */ 3584 static void 3585 ibd_fini_txlist(ibd_state_t *state) 3586 { 3587 ibd_swqe_t *node; 3588 3589 mutex_enter(&state->id_tx_list.dl_mutex); 3590 while (state->id_tx_list.dl_head != NULL) { 3591 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3592 state->id_tx_list.dl_head = node->swqe_next; 3593 state->id_tx_list.dl_cnt--; 3594 ASSERT(state->id_tx_list.dl_cnt >= 0); 3595 ibd_free_swqe(state, node); 3596 } 3597 mutex_exit(&state->id_tx_list.dl_mutex); 3598 } 3599 3600 /* 3601 * Allocate a single send wqe and register it so it is almost 3602 * ready to be posted to the hardware. 3603 */ 3604 static int 3605 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe) 3606 { 3607 ibt_mr_attr_t mem_attr; 3608 ibd_swqe_t *swqe; 3609 3610 swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP); 3611 *wqe = swqe; 3612 swqe->swqe_type = IBD_WQE_SEND; 3613 swqe->swqe_next = NULL; 3614 swqe->swqe_prev = NULL; 3615 swqe->swqe_im_mblk = NULL; 3616 swqe->w_mdtinfo = NULL; 3617 3618 /* alloc copy buffer, must be max size to handle multiple mblk case */ 3619 swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP); 3620 3621 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3622 mem_attr.mr_len = state->id_mtu; 3623 mem_attr.mr_as = NULL; 3624 mem_attr.mr_flags = IBT_MR_SLEEP; 3625 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3626 &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) != 3627 IBT_SUCCESS) { 3628 DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()"); 3629 kmem_free(swqe->swqe_copybuf.ic_bufaddr, 3630 state->id_mtu); 3631 kmem_free(swqe, sizeof (ibd_swqe_t)); 3632 return (DDI_FAILURE); 3633 } 3634 3635 swqe->swqe_copybuf.ic_sgl.ds_va = 3636 (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; 3637 swqe->swqe_copybuf.ic_sgl.ds_key = 3638 swqe->swqe_copybuf.ic_mr_desc.md_lkey; 3639 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3640 3641 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3642 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3643 swqe->w_swr.wr_trans = IBT_UD_SRV; 3644 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3645 3646 /* These are set in send */ 3647 swqe->w_swr.wr_nds = 0; 3648 swqe->w_swr.wr_sgl = NULL; 3649 3650 return (DDI_SUCCESS); 3651 } 3652 3653 /* 3654 * Free an allocated send wqe. 3655 */ 3656 static void 3657 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3658 { 3659 3660 if (ibt_deregister_mr(state->id_hca_hdl, 3661 swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3662 DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()"); 3663 return; 3664 } 3665 kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu); 3666 kmem_free(swqe, sizeof (ibd_swqe_t)); 3667 } 3668 3669 /* 3670 * Post a rwqe to the hardware and add it to the Rx list. The 3671 * "recycle" parameter indicates whether an old rwqe is being 3672 * recycled, or this is a new one. 3673 */ 3674 static int 3675 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3676 { 3677 if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) != 3678 IBT_SUCCESS) { 3679 DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()"); 3680 return (DDI_FAILURE); 3681 } 3682 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3683 3684 /* 3685 * Buffers being recycled are already in the list. 
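* Only a newly allocated rwqe needs to be appended to the Rx list below.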
3686 */ 3687 if (recycle) 3688 return (DDI_SUCCESS); 3689 3690 mutex_enter(&state->id_rx_list.dl_mutex); 3691 if (state->id_rx_list.dl_head == NULL) { 3692 rwqe->rwqe_prev = NULL; 3693 rwqe->rwqe_next = NULL; 3694 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3695 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3696 } else { 3697 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3698 rwqe->rwqe_next = NULL; 3699 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3700 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3701 } 3702 mutex_exit(&state->id_rx_list.dl_mutex); 3703 3704 return (DDI_SUCCESS); 3705 } 3706 3707 /* 3708 * Allocate the statically allocated Rx buffer list. 3709 */ 3710 static int 3711 ibd_init_rxlist(ibd_state_t *state) 3712 { 3713 ibd_rwqe_t *rwqe; 3714 int i; 3715 3716 for (i = 0; i < state->id_num_rwqe; i++) { 3717 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3718 ibd_fini_rxlist(state); 3719 return (DDI_FAILURE); 3720 } 3721 3722 if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { 3723 ibd_free_rwqe(state, rwqe); 3724 ibd_fini_rxlist(state); 3725 return (DDI_FAILURE); 3726 } 3727 } 3728 3729 return (DDI_SUCCESS); 3730 } 3731 3732 /* 3733 * Free the statically allocated Rx buffer list. 3734 * 3735 */ 3736 static void 3737 ibd_fini_rxlist(ibd_state_t *state) 3738 { 3739 ibd_rwqe_t *node; 3740 3741 mutex_enter(&state->id_rx_list.dl_mutex); 3742 while (state->id_rx_list.dl_head != NULL) { 3743 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3744 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3745 state->id_rx_list.dl_cnt--; 3746 ASSERT(state->id_rx_list.dl_cnt >= 0); 3747 3748 ibd_free_rwqe(state, node); 3749 } 3750 mutex_exit(&state->id_rx_list.dl_mutex); 3751 } 3752 3753 /* 3754 * Allocate a single recv wqe and register it so it is almost 3755 * ready to be posted to the hardware. 
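* The receive buffer covers the IPoIB GRH plus the link MTU, and is wrapped in a desballoc'd mblk so that ibd_freemsg_cb() can recycle it once the upper layer frees it.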
3756 */ 3757 static int 3758 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3759 { 3760 ibt_mr_attr_t mem_attr; 3761 ibd_rwqe_t *rwqe; 3762 3763 if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3764 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3765 return (DDI_FAILURE); 3766 } 3767 *wqe = rwqe; 3768 rwqe->rwqe_type = IBD_WQE_RECV; 3769 rwqe->w_state = state; 3770 rwqe->rwqe_next = NULL; 3771 rwqe->rwqe_prev = NULL; 3772 rwqe->w_freeing_wqe = B_FALSE; 3773 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3774 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3775 3776 if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3777 IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) { 3778 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2"); 3779 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3780 return (DDI_FAILURE); 3781 } 3782 3783 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3784 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3785 NULL) { 3786 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3787 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3788 state->id_mtu + IPOIB_GRH_SIZE); 3789 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3790 return (DDI_FAILURE); 3791 } 3792 3793 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3794 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3795 mem_attr.mr_as = NULL; 3796 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3797 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3798 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3799 IBT_SUCCESS) { 3800 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3801 rwqe->w_freeing_wqe = B_TRUE; 3802 freemsg(rwqe->rwqe_im_mblk); 3803 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3804 state->id_mtu + IPOIB_GRH_SIZE); 3805 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3806 return (DDI_FAILURE); 3807 } 3808 3809 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3810 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3811 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3812 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3813 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3814 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3815 rwqe->w_rwr.wr_nds = 1; 3816 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3817 3818 return (DDI_SUCCESS); 3819 } 3820 3821 /* 3822 * Free an allocated recv wqe. 3823 */ 3824 static void 3825 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3826 { 3827 3828 if (ibt_deregister_mr(state->id_hca_hdl, 3829 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3830 DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()"); 3831 return; 3832 } 3833 3834 /* 3835 * Indicate to the callback function that this rwqe/mblk 3836 * should not be recycled. The freemsg() will invoke 3837 * ibd_freemsg_cb(). 3838 */ 3839 if (rwqe->rwqe_im_mblk != NULL) { 3840 rwqe->w_freeing_wqe = B_TRUE; 3841 freemsg(rwqe->rwqe_im_mblk); 3842 } 3843 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3844 state->id_mtu + IPOIB_GRH_SIZE); 3845 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3846 } 3847 3848 /* 3849 * Delete the rwqe being freed from the rx list. 
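* Unlike ibd_fini_rxlist(), this unlinks a single entry, fixing up the list head/tail pointers under the Rx list mutex.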
3850 */ 3851 static void 3852 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3853 { 3854 mutex_enter(&state->id_rx_list.dl_mutex); 3855 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3856 state->id_rx_list.dl_head = rwqe->rwqe_next; 3857 else 3858 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3859 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3860 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3861 else 3862 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3863 mutex_exit(&state->id_rx_list.dl_mutex); 3864 } 3865 3866 /* 3867 * Pre ibt_detach() deconstruction. 3868 */ 3869 static void 3870 ibd_drv_fini(ibd_state_t *state) 3871 { 3872 ib_gid_t mgid; 3873 ibd_mce_t *mce; 3874 ibt_status_t status; 3875 uint8_t jstate; 3876 3877 /* 3878 * Unsubscribe from trap notices; we will be tearing down 3879 * the mcg lists soon. Make sure the trap handler does nothing 3880 * even if it is invoked (ie till we invoke ibt_detach()). 3881 */ 3882 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 3883 mutex_enter(&state->id_trap_lock); 3884 state->id_trap_stop = B_TRUE; 3885 while (state->id_trap_inprog > 0) 3886 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 3887 mutex_exit(&state->id_trap_lock); 3888 3889 /* 3890 * Flushing the channel ensures that all pending WQE's 3891 * are marked with flush_error and handed to the CQ. It 3892 * does not guarantee the invocation of the CQ handler. 3893 * This call is guaranteed to return successfully for UD QPNs. 3894 */ 3895 status = ibt_flush_channel(state->id_chnl_hdl); 3896 ASSERT(status == IBT_SUCCESS); 3897 3898 /* 3899 * We possibly need a loop here to wait for all the Tx 3900 * callbacks to happen. The Tx handlers will retrieve 3901 * held resources like AH ac_ref count, registered memory 3902 * and possibly ASYNC_REAP requests. Rx interrupts were already 3903 * turned off (in ibd_detach()); turn off Tx interrupts and 3904 * poll. By the time the polling returns an empty indicator, 3905 * we are sure we have seen all pending Tx callbacks. Note 3906 * that after the ibt_set_cq_handler() returns, the old handler 3907 * is guaranteed not to be invoked anymore. 3908 */ 3909 if (ibd_separate_cqs == 1) 3910 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 3911 ibd_poll_compq(state, state->id_scq_hdl); 3912 3913 /* 3914 * No more async requests will be posted since the device has been 3915 * unregistered; completion handlers have been turned off, so the Tx 3916 * handler will not cause any more ASYNC_REAP requests. Queue a 3917 * request for the async thread to exit, which will be serviced 3918 * after any pending ones. This can take a while, especially if the 3919 * SM is unreachable, since IBMF will slowly time out each SM request 3920 * issued by the async thread. Reap the thread before continuing on; 3921 * we do not want it to be lingering in modunloaded code. 3922 */ 3923 ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT); 3924 thread_join(state->id_async_thrid); 3925 3926 /* 3927 * We can not be in promiscuous mode anymore; upper layers 3928 * would have made a request to disable it (if ever set previously) 3929 * before the detach is allowed to progress to this point, and the 3930 * async thread would have processed that request by now. Thus the 3931 * nonmember list is guaranteed empty at this point. 3932 */ 3933 ASSERT(state->id_prom_op != COMPLETED); 3934 3935 /* 3936 * Drop all residual full/non membership. This includes full 3937 * membership to the broadcast group, and any nonmembership 3938 * acquired during transmits.
We do this after the Tx completion 3939 * handlers are done, since those might result in some late 3940 * leaves; this also eliminates a potential race with that 3941 * path wrt the mc full list insert/delete. Trap handling 3942 * has also been suppressed at this point. Thus, no locks 3943 * are required while traversing the mc full list. 3944 */ 3945 DPRINT(2, "ibd_drv_fini : clear full cache entries"); 3946 mce = list_head(&state->id_mc_full); 3947 while (mce != NULL) { 3948 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3949 jstate = mce->mc_jstate; 3950 mce = list_next(&state->id_mc_full, mce); 3951 ibd_leave_group(state, mgid, jstate); 3952 } 3953 3954 ibt_free_mcg_info(state->id_mcinfo, 1); 3955 3956 /* 3957 * Kill the channel now; guaranteed to return successfully 3958 * for UD QPNs. 3959 */ 3960 status = ibt_free_channel(state->id_chnl_hdl); 3961 ASSERT(status == IBT_SUCCESS); 3962 3963 /* 3964 * Kill the CQ; all completion handlers are guaranteed to 3965 * have terminated by the time this returns. Since we killed 3966 * the QPN above, we can not receive the IBT_CQ_BUSY error. 3967 */ 3968 status = ibt_free_cq(state->id_rcq_hdl); 3969 ASSERT(status == IBT_SUCCESS); 3970 3971 if (ibd_separate_cqs == 1) { 3972 status = ibt_free_cq(state->id_scq_hdl); 3973 ASSERT(status == IBT_SUCCESS); 3974 } 3975 3976 /* 3977 * We killed the receive interrupts, thus, we will not be 3978 * required to handle received packets anymore. Thus, kill 3979 * service threads since they are not going to be used anymore. 3980 */ 3981 unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); 3982 3983 /* 3984 * Since these following will act on the Rx/Tx list, which 3985 * is also looked at by the Rx/Tx handlers, keep them around 3986 * till all handlers are guaranteed to have completed. 3987 */ 3988 ibd_fini_rxlist(state); 3989 ibd_fini_txlist(state); 3990 3991 /* 3992 * Clean up the active AH hash list. 3993 */ 3994 mod_hash_destroy_hash(state->id_ah_active_hash); 3995 3996 /* 3997 * Free parallel ARP cache and AHs; we are sure all of these 3998 * resources have been released by the Tx completion handler. 3999 */ 4000 ibd_acache_fini(state); 4001 4002 /* 4003 * We freed the QPN, all the MRs and AHs. This step should not 4004 * fail; print a warning message if it does fail, due to a bug 4005 * in the driver. 4006 */ 4007 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 4008 ibd_print_warn(state, "failed to free protection domain"); 4009 4010 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 4011 ibd_print_warn(state, "failed to close HCA device"); 4012 } 4013 4014 /* 4015 * IBA Rx/Tx completion queue handler. Guaranteed to be single 4016 * threaded and nonreentrant for this CQ. When using combined CQ, 4017 * this handles Tx and Rx completions. With separate CQs, this handles 4018 * only Rx completions. 4019 */ 4020 /* ARGSUSED */ 4021 static void 4022 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4023 { 4024 ibd_state_t *state = (ibd_state_t *)arg; 4025 4026 atomic_add_64(&state->id_num_intrs, 1); 4027 (void) gld_intr(state->id_macinfo); 4028 } 4029 4030 /* 4031 * Separate CQ handler for Tx completions, when the Tx CQ is in 4032 * interrupt driven mode. 4033 */ 4034 /* ARGSUSED */ 4035 static void 4036 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4037 { 4038 ibd_state_t *state = (ibd_state_t *)arg; 4039 4040 atomic_add_64(&state->id_num_intrs, 1); 4041 4042 /* 4043 * Poll for completed entries; the CQ will not interrupt any 4044 * more for completed packets. 
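* ibd_poll_compq() drains the CQ via ibt_poll_cq() in batches of IBD_WC_SIZE entries.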
4045 */ 4046 ibd_poll_compq(state, state->id_scq_hdl); 4047 4048 /* 4049 * Now enable CQ notifications; all completions originating now 4050 * will cause new interrupts. 4051 */ 4052 if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != 4053 IBT_SUCCESS) { 4054 /* 4055 * We do not expect a failure here. 4056 */ 4057 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 4058 } 4059 4060 /* 4061 * Repoll to catch all packets that might have completed after 4062 * we finished the first poll loop and before interrupts got 4063 * armed. 4064 */ 4065 ibd_poll_compq(state, state->id_scq_hdl); 4066 } 4067 4068 /* 4069 * Multicast group create/delete trap handler. These will be delivered 4070 * on a kernel thread (handling can thus block) and can be invoked 4071 * concurrently. The handler can be invoked anytime after it is 4072 * registered and before ibt_detach(). 4073 */ 4074 /* ARGSUSED */ 4075 static void 4076 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4077 ibt_subnet_event_t *event) 4078 { 4079 ibd_state_t *state = (ibd_state_t *)arg; 4080 ibd_req_t *req; 4081 4082 /* 4083 * The trap handler will get invoked once for every event for 4084 * evert port. The input "gid" is the GID0 of the port the 4085 * trap came in on; we just need to act on traps that came 4086 * to our port, meaning the port on which the ipoib interface 4087 * resides. Since ipoib uses GID0 of the port, we just match 4088 * the gids to check whether we need to handle the trap. 4089 */ 4090 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4091 return; 4092 4093 DPRINT(10, "ibd_notices_handler : %d\n", code); 4094 4095 switch (code) { 4096 case IBT_SM_EVENT_UNAVAILABLE: 4097 /* 4098 * If we are in promiscuous mode or have 4099 * sendnonmembers, we need to print a warning 4100 * message right now. Else, just store the 4101 * information, print when we enter promiscuous 4102 * mode or attempt nonmember send. We might 4103 * also want to stop caching sendnonmember. 4104 */ 4105 ibd_print_warn(state, "IBA multicast support " 4106 "degraded due to unavailability of multicast " 4107 "traps"); 4108 break; 4109 case IBT_SM_EVENT_AVAILABLE: 4110 /* 4111 * If we printed a warning message above or 4112 * while trying to nonmember send or get into 4113 * promiscuous mode, print an okay message. 4114 */ 4115 ibd_print_warn(state, "IBA multicast support " 4116 "restored due to availability of multicast " 4117 "traps"); 4118 break; 4119 case IBT_SM_EVENT_MCG_CREATED: 4120 case IBT_SM_EVENT_MCG_DELETED: 4121 /* 4122 * Common processing of creation/deletion traps. 4123 * First check if the instance is being 4124 * [de]initialized; back off then, without doing 4125 * anything more, since we are not sure if the 4126 * async thread is around, or whether we might 4127 * be racing with the detach code in ibd_drv_fini() 4128 * that scans the mcg list. 4129 */ 4130 if (!ibd_async_safe(state)) 4131 return; 4132 4133 req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); 4134 req->rq_gid = event->sm_notice_gid; 4135 req->rq_ptr = (void *)code; 4136 ibd_queue_work_slot(state, req, ASYNC_TRAP); 4137 break; 4138 } 4139 } 4140 4141 static void 4142 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4143 { 4144 ib_gid_t mgid = req->rq_gid; 4145 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4146 4147 DPRINT(10, "ibd_async_trap : %d\n", code); 4148 4149 /* 4150 * Atomically search the nonmember and sendonlymember lists and 4151 * delete. 
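* A SendOnlyNonMember entry is dropped unconditionally; a NonMember entry is dropped (and possibly rejoined) only when promiscuous mode has completed, as handled below.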
4152 */ 4153 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4154 4155 if (state->id_prom_op == COMPLETED) { 4156 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4157 4158 /* 4159 * If in promiscuous mode, try to join/attach to the new 4160 * mcg. Given the unreliable out-of-order mode of trap 4161 * delivery, we can never be sure whether it is a problem 4162 * if the join fails. Thus, we warn the admin of a failure 4163 * if this was a creation trap. Note that the trap might 4164 * actually be reporting a long past event, and the mcg 4165 * might already have been deleted, thus we might be warning 4166 * in vain. 4167 */ 4168 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4169 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4170 ibd_print_warn(state, "IBA promiscuous mode missed " 4171 "new multicast gid %016llx:%016llx", 4172 (u_longlong_t)mgid.gid_prefix, 4173 (u_longlong_t)mgid.gid_guid); 4174 } 4175 4176 /* 4177 * Free the request slot allocated by the subnet event thread. 4178 */ 4179 kmem_free(req, sizeof (ibd_req_t)); 4180 4181 ibd_async_done(state); 4182 } 4183 4184 /* 4185 * GLD entry point to reset hardware. 4186 */ 4187 /* ARGSUSED */ 4188 static int 4189 ibd_reset(gld_mac_info_t *macinfo) 4190 { 4191 /* 4192 * This will be invoked from Style 1 open() and Style 2 4193 * attach() routines, ie just before the interface starts 4194 * getting used. 4195 */ 4196 return (GLD_SUCCESS); 4197 } 4198 4199 /* 4200 * GLD entry point to start hardware. 4201 */ 4202 /* ARGSUSED */ 4203 static int 4204 ibd_start(gld_mac_info_t *macinfo) 4205 { 4206 return (GLD_SUCCESS); 4207 } 4208 4209 /* 4210 * GLD entry point to stop hardware from receiving packets. 4211 */ 4212 /* ARGSUSED */ 4213 static int 4214 ibd_stop(gld_mac_info_t *macinfo) 4215 { 4216 #ifdef RUN_PERFORMANCE 4217 ibd_perf((ibd_state_t *)macinfo->gldm_private); 4218 #endif 4219 return (GLD_SUCCESS); 4220 } 4221 4222 /* 4223 * GLD entry point to modify device's mac address. We do not 4224 * allow address modifications. 4225 */ 4226 static int 4227 ibd_set_mac_addr(gld_mac_info_t *macinfo, unsigned char *macaddr) 4228 { 4229 ibd_state_t *state; 4230 4231 state = (ibd_state_t *)macinfo->gldm_private; 4232 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4233 return (GLD_SUCCESS); 4234 else 4235 return (GLD_FAILURE); 4236 } 4237 4238 /* 4239 * The blocking part of the IBA join/leave operations are done out 4240 * of here on the async thread. 4241 */ 4242 static void 4243 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4244 { 4245 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4246 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4247 4248 if (op == ASYNC_JOIN) { 4249 int ret = ERRORED; 4250 4251 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) != NULL) 4252 ret = COMPLETED; 4253 4254 state->id_multi_op = ret; 4255 } else { 4256 /* 4257 * Here, we must search for the proper mcg_info and 4258 * use that to leave the group. 4259 */ 4260 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4261 } 4262 } 4263 4264 /* 4265 * GLD entry point for multicast enable/disable requests. 4266 * Invoked by GLD only on the first multicast enable for a specific 4267 * address (GLD is free to retry ocassionally if we return RETRY), 4268 * and on last disable of the same address. Just queue the operation 4269 * to the async thread. 
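* Joins use the single id_multi_req slot, so only one join can be outstanding at a time; the id_multi_op state machine below tracks its progress.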
4270 */ 4271 static int 4272 ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) 4273 { 4274 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4275 ipoib_mac_t *mcast; 4276 ib_gid_t mgid; 4277 ib_qpn_t mcqpn; 4278 int ret; 4279 4280 /* 4281 * The incoming multicast address might not be aligned properly 4282 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4283 * it to look like one though, to get the offsets of the mc gid, 4284 * since we know we are not going to dereference any values with 4285 * the ipoib_mac_t pointer. 4286 */ 4287 mcast = (ipoib_mac_t *)mcmac; 4288 4289 /* 4290 * Check validity of MCG address. We could additionally check 4291 * that an enable/disable is not being issued on the "broadcast" 4292 * mcg, but since this operation is only invokable by privileged 4293 * programs anyway, we allow the flexibility to those dlpi apps. 4294 * Note that we do not validate the "scope" of the IBA mcg. 4295 */ 4296 bcopy(&mcast->ipoib_qpn, &mcqpn, sizeof (ib_qpn_t)); 4297 if (mcqpn != htonl(IB_MC_QPN)) 4298 return (GLD_FAILURE); 4299 4300 /* 4301 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4302 * nothing (ie we stay JOINed to the broadcast group done in 4303 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically 4304 * requires being joined to broadcast groups at all times. 4305 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4306 * depends on this. 4307 */ 4308 if (bcmp(mcast, state->id_macinfo->gldm_broadcast_addr, 4309 IPOIB_ADDRL) == 0) 4310 return (GLD_SUCCESS); 4311 4312 ibd_n2h_gid(mcast, &mgid); 4313 4314 if (op == GLD_MULTI_ENABLE) { 4315 DPRINT(1, "ibd_set_multicast : %016llx:%016llx\n", 4316 mgid.gid_prefix, mgid.gid_guid); 4317 ret = GLD_RETRY; 4318 mutex_enter(&state->id_mc_mutex); 4319 if (state->id_multi_op == NOTSTARTED) { 4320 state->id_multi_req.rq_gid = mgid; 4321 ibd_queue_work_slot(state, &state->id_multi_req, 4322 ASYNC_JOIN); 4323 state->id_multi_op = ONGOING; 4324 bcopy(mcast, &state->id_multi_addr, IPOIB_ADDRL); 4325 } else if (bcmp(&state->id_multi_addr, mcast, 4326 IPOIB_ADDRL) == 0) { 4327 if (state->id_multi_op != ONGOING) { 4328 if (state->id_multi_op == COMPLETED) 4329 ret = GLD_SUCCESS; 4330 else if (state->id_multi_op == ERRORED) 4331 ret = GLD_FAILURE; 4332 if (state->id_multi_queued) { 4333 state->id_multi_queued = B_FALSE; 4334 ibd_queue_work_slot(state, 4335 &state->id_multi_req, ASYNC_POKE); 4336 } else { 4337 state->id_multi_op = NOTSTARTED; 4338 } 4339 } 4340 } else { 4341 /* 4342 * Hmmm, a set was tried on another mcg. We 4343 * need to make sure to gld_sched for this 4344 * stream to retry once the ongoing one terminates. 4345 * The gld_sched out of the async thread on completion 4346 * of the mcg join is not enough, because the queued 4347 * stream might come in and get a RETRY again because 4348 * the mcg join result has still not been reaped by 4349 * the originator. If gld_sched ensured that streams 4350 * get tried in the order they received RETRYs, things 4351 * would be simpler. 4352 */ 4353 state->id_multi_queued = B_TRUE; 4354 } 4355 mutex_exit(&state->id_mc_mutex); 4356 } else { 4357 ibd_mce_t *mce; 4358 DPRINT(1, "ibd_set_multicast : unset_multicast : " 4359 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4360 ret = GLD_SUCCESS; 4361 mutex_enter(&state->id_mc_mutex); 4362 mce = IBD_MCACHE_FIND_FULL(state, mgid); 4363 mutex_exit(&state->id_mc_mutex); 4364 /* 4365 * GLD should not have invoked us unless the mcg was 4366 * added in the past.
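* The mce looked up above carries the mc_req slot used below to queue the ASYNC_LEAVE request.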
4367 */ 4368 ASSERT(mce != NULL); 4369 ASSERT(bcmp(&mce->mc_req.rq_gid, &mgid, sizeof (mgid)) == 0); 4370 ibd_queue_work_slot(state, &mce->mc_req, ASYNC_LEAVE); 4371 } 4372 return (ret); 4373 } 4374 4375 /* 4376 * The blocking part of the IBA promiscuous operations are done 4377 * out of here on the async thread. The dlpireq parameter indicates 4378 * whether this invocation is due to a dlpi request or due to 4379 * a port up/down event. 4380 */ 4381 static void 4382 ibd_async_unsetprom(ibd_state_t *state, boolean_t dlpireq) 4383 { 4384 ibd_mce_t *mce = list_head(&state->id_mc_non); 4385 ib_gid_t mgid; 4386 4387 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4388 4389 /* 4390 * Mark the request slot as empty and reusable for the 4391 * next promiscuous set request. 4392 */ 4393 if (dlpireq) 4394 state->id_prom_op = NOTSTARTED; 4395 4396 while (mce != NULL) { 4397 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4398 mce = list_next(&state->id_mc_non, mce); 4399 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4400 } 4401 } 4402 4403 /* 4404 * The blocking part of the IBA promiscuous operations are done 4405 * out of here on the async thread. The dlpireq parameter indicates 4406 * whether this invocation is due to a dlpi request or due to 4407 * a port up/down event. 4408 */ 4409 static void 4410 ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq) 4411 { 4412 ibt_mcg_attr_t mcg_attr; 4413 ibt_mcg_info_t *mcg_info; 4414 ib_gid_t mgid; 4415 uint_t numg; 4416 int i; 4417 4418 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4419 4420 /* 4421 * Obtain all active MC groups on the IB fabric with 4422 * specified criteria (scope + Pkey + Qkey + mtu). 4423 */ 4424 bzero(&mcg_attr, sizeof (mcg_attr)); 4425 mcg_attr.mc_pkey = state->id_pkey; 4426 mcg_attr.mc_scope = state->id_scope; 4427 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4428 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4429 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4430 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4431 IBT_SUCCESS) { 4432 ibd_print_warn(state, "Could not get list of IBA multicast " 4433 "groups"); 4434 if (dlpireq) 4435 state->id_prom_op = ERRORED; 4436 return; 4437 } 4438 4439 /* 4440 * Iterate over the returned mcg's and join as NonMember 4441 * to the IP mcg's. 4442 */ 4443 for (i = 0; i < numg; i++) { 4444 /* 4445 * Do a NonMember JOIN on the MC group. 4446 */ 4447 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4448 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4449 ibd_print_warn(state, "IBA promiscuous mode missed " 4450 "multicast gid %016llx:%016llx", 4451 (u_longlong_t)mgid.gid_prefix, 4452 (u_longlong_t)mgid.gid_guid); 4453 } 4454 4455 ibt_free_mcg_info(mcg_info, numg); 4456 if (dlpireq) 4457 state->id_prom_op = COMPLETED; 4458 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4459 } 4460 4461 /* 4462 * GLD entry point for multicast promiscuous enable/disable requests. 4463 * GLD assumes phys state receives more packets than multi state, 4464 * which is not true for IPoIB. Thus, treat the multi and phys 4465 * promiscuous states the same way to work with GLD's assumption. 
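* Both GLD_MAC_PROMISC_PHYS and GLD_MAC_PROMISC_MULTI are therefore mapped to the same ASYNC_PROMON/ASYNC_PROMOFF requests below.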
4466 */ 4467 static int 4468 ibd_set_promiscuous(gld_mac_info_t *macinfo, int mode) 4469 { 4470 ibd_state_t *state; 4471 int ret; 4472 4473 state = (ibd_state_t *)macinfo->gldm_private; 4474 switch (mode) { 4475 case GLD_MAC_PROMISC_PHYS: 4476 case GLD_MAC_PROMISC_MULTI: 4477 DPRINT(1, "ibd_set_promiscuous : set_promisc : %d", 4478 mode); 4479 /* 4480 * Look at gld: this might be getting 4481 * called because someone is turning off 4482 * prom_phys. Nothing needs to be done in 4483 * that case. 4484 */ 4485 ret = GLD_RETRY; 4486 mutex_enter(&state->id_mc_mutex); 4487 switch (state->id_prom_op) { 4488 case NOTSTARTED: 4489 ibd_queue_work_slot(state, 4490 &state->id_prom_req, ASYNC_PROMON); 4491 state->id_prom_op = ONGOING; 4492 break; 4493 case COMPLETED: 4494 ret = GLD_SUCCESS; 4495 break; 4496 case ERRORED: 4497 state->id_prom_op = NOTSTARTED; 4498 ret = GLD_FAILURE; 4499 } 4500 /* 4501 * Else in the ONGOING case, nothing special 4502 * needs to be done; the async thread will poke 4503 * all streams. A prior set, or the last unset 4504 * request is still in the async queue. 4505 */ 4506 mutex_exit(&state->id_mc_mutex); 4507 return (ret); 4508 case GLD_MAC_PROMISC_NONE: 4509 DPRINT(1, "ibd_set_promiscuous : unset_promisc"); 4510 /* 4511 * Look at gld: this might be getting 4512 * called because someone is turning off 4513 * prom_phys or prom_multi. Mark operation 4514 * as ongoing, to prevent a subsequent set 4515 * operation from using the request slot 4516 * unless the async thread is ready to give 4517 * it up. The async thread will mark the 4518 * request slot as usable as soon as it 4519 * starts doing the unset operation. 4520 */ 4521 ASSERT(state->id_prom_op == COMPLETED); 4522 state->id_prom_op = ONGOING; 4523 ibd_queue_work_slot(state, &state->id_prom_req, 4524 ASYNC_PROMOFF); 4525 return (GLD_SUCCESS); 4526 default: 4527 return (GLD_NOTSUPPORTED); 4528 } 4529 } 4530 4531 /* 4532 * GLD entry point for gathering statistics. 4533 */ 4534 static int 4535 ibd_get_stats(gld_mac_info_t *macinfo, struct gld_stats *sp) 4536 { 4537 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4538 4539 sp->glds_errrcv = 0; 4540 sp->glds_underflow = 0; 4541 sp->glds_missed = 0; 4542 4543 sp->glds_overflow = state->id_tx_short; /* Tx overflow */ 4544 sp->glds_speed = state->id_link_speed; 4545 sp->glds_media = GLDM_IB; 4546 sp->glds_errxmt = state->id_ah_error; /* failed AH translation */ 4547 sp->glds_norcvbuf = state->id_rx_short; /* # times below water mark */ 4548 sp->glds_intr = state->id_num_intrs; /* number of intrs */ 4549 4550 return (GLD_SUCCESS); 4551 } 4552 4553 /* 4554 * Arrange for a Tx request that is failing, or has already failed due to 4555 * Tx descriptor shortage to be retried soon. Used mostly with poll based 4556 * Tx completion, since gld_sched() can not be invoked in ibd_send() context 4557 * due to potential single processor deadlock (when the ibd_send() is 4558 * caused by gld_recv()). 4559 */ 4560 static void 4561 ibd_tx_sched(ibd_state_t *state) 4562 { 4563 mutex_enter(&state->id_sched_lock); 4564 /* 4565 * If a sched request is already enqueued, do not try to do 4566 * that again, since the async work request list would get 4567 * corrupted. 4568 */ 4569 if (!state->id_sched_queued) { 4570 state->id_sched_queued = B_TRUE; 4571 ibd_queue_work_slot(state, &state->id_sched_req, ASYNC_SCHED); 4572 } 4573 mutex_exit(&state->id_sched_lock); 4574 } 4575 4576 /* 4577 * The gld_sched() in ibd_async_work() does the work for us. 
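* All that is left to do here is clear id_sched_queued under id_sched_lock so that ibd_tx_sched() can queue a fresh request later.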
4578 */ 4579 static void 4580 ibd_async_txsched(ibd_state_t *state) 4581 { 4582 mutex_enter(&state->id_sched_lock); 4583 state->id_sched_queued = B_FALSE; 4584 mutex_exit(&state->id_sched_lock); 4585 } 4586 4587 /* 4588 * Release one or more chained send wqes back into free list. 4589 */ 4590 static void 4591 ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *fswqe, ibd_swqe_t *lswqe, 4592 boolean_t send_context) 4593 { 4594 boolean_t call_gld_sched = B_FALSE; 4595 4596 /* 4597 * Add back on Tx list for reuse. 4598 */ 4599 lswqe->swqe_next = NULL; 4600 mutex_enter(&state->id_tx_list.dl_mutex); 4601 if (state->id_tx_list.dl_pending_sends) { 4602 state->id_tx_list.dl_pending_sends = B_FALSE; 4603 call_gld_sched = B_TRUE; 4604 } 4605 if (state->id_tx_list.dl_head == NULL) { 4606 state->id_tx_list.dl_head = SWQE_TO_WQE(fswqe); 4607 } else { 4608 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(fswqe); 4609 } 4610 state->id_tx_list.dl_tail = SWQE_TO_WQE(lswqe); 4611 mutex_exit(&state->id_tx_list.dl_mutex); 4612 4613 /* 4614 * See comments in ibd_tx_sched(); make sure not to call 4615 * gld_sched() if we are in ibd_send() context. 4616 */ 4617 if (call_gld_sched) 4618 if ((ibd_txcomp_poll == 0) && (!send_context)) 4619 gld_sched(state->id_macinfo); 4620 else 4621 ibd_tx_sched(state); 4622 } 4623 4624 /* 4625 * Acquire a number of chained send wqe's from the free list. Returns the 4626 * number of wqe's actually allocated, and pointers to the first and last 4627 * in the chain. 4628 */ 4629 static int 4630 ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe, 4631 int number) 4632 { 4633 int numwqe = number; 4634 ibd_swqe_t *node, *wqes; 4635 4636 /* 4637 * Check and reclaim some of the completed Tx requests. 4638 * If someone else is already in this code and pulling Tx 4639 * completions, no need to poll, since the current lock holder 4640 * will do the work anyway. Normally, we poll for completions 4641 * every few Tx attempts, but if we are short on Tx descriptors, 4642 * we always try to poll. 4643 */ 4644 if ((ibd_txcomp_poll == 1) && 4645 (((atomic_add_32_nv(&state->id_tx_sends, 1) & IBD_TXPOLL_MASK) == 4646 0) || state->id_tx_list.dl_pending_sends) && 4647 (mutex_tryenter(&state->id_txcomp_lock) != 0)) { 4648 DPRINT(10, "ibd_send : polling"); 4649 ibd_poll_compq(state, state->id_scq_hdl); 4650 mutex_exit(&state->id_txcomp_lock); 4651 } 4652 4653 /* 4654 * Grab required transmit wqes. 4655 */ 4656 mutex_enter(&state->id_tx_list.dl_mutex); 4657 node = wqes = WQE_TO_SWQE(state->id_tx_list.dl_head); 4658 while ((node != NULL) && (numwqe-- > 1)) 4659 node = WQE_TO_SWQE(node->swqe_next); 4660 4661 /* 4662 * If we did not find the number we were looking for, flag no resource. 4663 * Adjust list appropriately in either case. 4664 */ 4665 if (numwqe != 0) { 4666 state->id_tx_list.dl_head = state->id_tx_list.dl_tail = NULL; 4667 state->id_tx_list.dl_pending_sends = B_TRUE; 4668 mutex_exit(&state->id_tx_list.dl_mutex); 4669 DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); 4670 atomic_add_64(&state->id_tx_short, 1); 4671 if (ibd_txcomp_poll == 1) { 4672 /* 4673 * Arrange for a future gld_sched(). Note that when 4674 * the Tx is retried after a little bit, it will 4675 * surely poll the completion queue above. 
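* (The dl_pending_sends flag set above guarantees that poll.)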
4676 */ 4677 ibd_tx_sched(state); 4678 } 4679 } else { 4680 state->id_tx_list.dl_head = node->swqe_next; 4681 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(node)) 4682 state->id_tx_list.dl_tail = NULL; 4683 mutex_exit(&state->id_tx_list.dl_mutex); 4684 } 4685 4686 /* 4687 * Set return parameters. 4688 */ 4689 *fswqe = wqes; 4690 *lswqe = node; 4691 return (number - numwqe); 4692 } 4693 4694 typedef struct ibd_mpack_s { 4695 ibd_swqe_t *ip_swqe; 4696 uint32_t ip_start, ip_stuff, ip_flags; 4697 ibd_ace_t *ip_ace; 4698 boolean_t ip_copy; 4699 boolean_t ip_noresources; 4700 int ip_segs; 4701 ibt_mr_hdl_t ip_mhdl[IBD_MDTMAX_SEGS + 1]; 4702 ibt_mr_desc_t ip_mdsc[IBD_MDTMAX_SEGS + 1]; 4703 } ibd_mpack_t; 4704 _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mpack_s)) 4705 4706 static void 4707 ibd_mdt_txone(gld_mac_info_t *macinfo, void *cookie, pdescinfo_t *dl_pkt_info) 4708 { 4709 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4710 ibd_mpack_t *ptx = (ibd_mpack_t *)cookie; 4711 ibd_ace_t *ace = ptx->ip_ace; 4712 ibd_swqe_t *wqes, *node = ptx->ip_swqe; 4713 boolean_t docopy = ptx->ip_copy; 4714 uchar_t *pptr; 4715 int i, pktsize, seglen, seg = 0; 4716 4717 /* 4718 * Snag the next wqe before we post this one, since it could complete 4719 * very fast and the wqe could get put at the end of the list, 4720 * corrupting our chain. Set up for the next packet. 4721 */ 4722 wqes = WQE_TO_SWQE(node->swqe_next); 4723 ptx->ip_swqe = wqes; 4724 4725 IBD_CKSUM_MDT_PACKET(dl_pkt_info, ptx->ip_start, ptx->ip_stuff, 4726 ptx->ip_flags); 4727 node->w_ahandle = ace; 4728 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4729 4730 if (docopy) { 4731 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 4732 pptr = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 4733 pktsize = seglen = PDESC_HDRL(dl_pkt_info); 4734 if (seglen > 0) { 4735 bcopy(dl_pkt_info->hdr_rptr, pptr, seglen); 4736 pptr += seglen; 4737 } 4738 for (; seg < dl_pkt_info->pld_cnt; seg++) 4739 if ((seglen = PDESC_PLDL(dl_pkt_info, seg)) > 0) { 4740 bcopy(dl_pkt_info->pld_ary[seg].pld_rptr, 4741 pptr, seglen); 4742 pptr += seglen; 4743 pktsize += seglen; 4744 } 4745 node->w_swr.wr_nds = 1; 4746 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 4747 } else { 4748 seglen = PDESC_HDRL(dl_pkt_info); 4749 if (seglen > 0) { 4750 node->w_smblk_sgl[seg].ds_va = 4751 (ib_vaddr_t)(uintptr_t)dl_pkt_info->hdr_rptr; 4752 node->w_smblk_sgl[seg].ds_key = ptx->ip_mdsc[0].md_lkey; 4753 node->w_smblk_sgl[seg].ds_len = seglen; 4754 seg++; 4755 } 4756 for (i = 0; i < dl_pkt_info->pld_cnt; i++) { 4757 if ((seglen = PDESC_PLDL(dl_pkt_info, i)) > 0) { 4758 node->w_smblk_sgl[seg].ds_va = (ib_vaddr_t) 4759 (uintptr_t)dl_pkt_info->pld_ary[i].pld_rptr; 4760 node->w_smblk_sgl[seg].ds_key = 4761 ptx->ip_mdsc[dl_pkt_info-> 4762 pld_ary[i].pld_pbuf_idx + 1].md_lkey; 4763 node->w_smblk_sgl[seg].ds_len = seglen; 4764 seg++; 4765 } 4766 } 4767 node->w_swr.wr_sgl = node->w_smblk_sgl; 4768 node->w_swr.wr_nds = seg; 4769 } 4770 4771 if (ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL) != 4772 IBT_SUCCESS) { 4773 /* 4774 * We never expect a failure here. But handle it, just in case. 4775 * If this is not the last packet, there are no problems; if 4776 * it is the last packet and the previous ones have not been 4777 * transmitted yet by the hardware, in the registration case, 4778 * the hardware might transmit garbage since we will be 4779 * freemsg'ing. The AH is still safe. 
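* ibd_tx_cleanup() below drops the AH reference and returns the swqe to the free list.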
4780 */ 4781 DPRINT(5, "ibd_mdt_txone: posting failed"); 4782 ibd_tx_cleanup(state, node, B_TRUE); 4783 } 4784 } 4785 4786 static int 4787 ibd_mdt_pre(gld_mac_info_t *macinfo, mblk_t *mp, void **cookie) 4788 { 4789 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4790 multidata_t *dlmdp = mmd_getmultidata(mp); 4791 ibd_mpack_t *mdinfo; 4792 mbufinfo_t bufinfo, *binfo = &bufinfo; 4793 pattrinfo_t attr_info; 4794 uchar_t *dlap; 4795 ibt_mr_attr_t mem_attr; 4796 ibd_swqe_t *wqes, *node; 4797 ipoib_mac_t *dest; 4798 size_t hsize, psize = 0; 4799 int numwqes, numpackets = (int)mmd_getcnt(dlmdp, NULL, NULL); 4800 int i, ret; 4801 uint32_t end, value; 4802 boolean_t noresources = B_FALSE; 4803 4804 ASSERT(DB_TYPE(mp) == M_MULTIDATA); 4805 ASSERT(mp->b_cont == NULL); 4806 4807 if ((numwqes = ibd_acquire_swqes(state, &wqes, &node, numpackets)) == 0) 4808 return (0); 4809 else if (numwqes != numpackets) 4810 noresources = B_TRUE; 4811 4812 DPRINT(20, "ibd_mdt_pre: %d packets %p/%p\n", numwqes, wqes, node); 4813 4814 /* 4815 * Allocate the cookie that will be passed to subsequent packet 4816 * transmit and post_mdt calls by GLD. We can not sleep, so if 4817 * there is no memory, just tell GLD to drop the entire MDT message. 4818 */ 4819 if ((mdinfo = kmem_zalloc(sizeof (ibd_mpack_t), KM_NOSLEEP)) == NULL) { 4820 ibd_release_swqes(state, wqes, node, B_TRUE); 4821 return (-1); 4822 } 4823 *cookie = (void *)mdinfo; 4824 mdinfo->ip_noresources = noresources; 4825 4826 /* 4827 * Walk Global Attributes. If TCP failed to provide destination 4828 * information, or some interposing module removed the information, 4829 * fail the entire message. 4830 */ 4831 attr_info.type = PATTR_DSTADDRSAP; 4832 if (mmd_getpattr(dlmdp, NULL, &attr_info) == NULL) { 4833 ibd_release_swqes(state, wqes, node, B_TRUE); 4834 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4835 return (-1); 4836 } 4837 dlap = ((pattr_addr_t *)attr_info.buf)->addr; 4838 dest = (ipoib_mac_t *)dlap; 4839 4840 /* 4841 * Get the AH for this destination, incrementing the posted 4842 * reference count properly. 4843 */ 4844 if ((mdinfo->ip_ace = ibd_acache_lookup(state, dest, &ret, 4845 numwqes)) == NULL) { 4846 ibd_release_swqes(state, wqes, node, B_TRUE); 4847 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4848 return ((ret == GLD_FAILURE) ? -1 : 0); 4849 } 4850 4851 /* 4852 * Depending on how costly it is to copy vs register, we try to 4853 * register, falling back on copying if we fail. 
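* Registration is attempted only when the combined header and payload size exceeds IBD_TX_COPY_THRESHOLD; smaller messages always take the copy path.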
4854 */ 4855 mmd_getregions(dlmdp, &bufinfo); 4856 hsize = binfo->hbuf_wptr - binfo->hbuf_rptr; 4857 for (i = 0; i < binfo->pbuf_cnt; i++) 4858 psize += (binfo->pbuf_ary[i].pbuf_wptr - 4859 binfo->pbuf_ary[i].pbuf_rptr); 4860 if ((hsize + psize) > IBD_TX_COPY_THRESHOLD) { 4861 mdinfo->ip_segs = i + 1; 4862 if (hsize != 0) { 4863 mem_attr.mr_as = NULL; 4864 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4865 mem_attr.mr_vaddr = 4866 (uint64_t)(uintptr_t)binfo->hbuf_rptr; 4867 mem_attr.mr_len = hsize; 4868 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 4869 &mem_attr, &mdinfo->ip_mhdl[0], 4870 &mdinfo->ip_mdsc[0]) != IBT_SUCCESS) 4871 goto ibd_mdt_copy; 4872 DPRINT(10, "ibd_mdt_pre: hsize = %d\n", hsize); 4873 } 4874 for (i = 0; i < binfo->pbuf_cnt; i++) { 4875 if ((psize = (binfo->pbuf_ary[i].pbuf_wptr - 4876 binfo->pbuf_ary[i].pbuf_rptr)) != 0) { 4877 mem_attr.mr_as = NULL; 4878 mem_attr.mr_flags = IBT_MR_NOSLEEP; 4879 mem_attr.mr_vaddr = (uint64_t)(uintptr_t) 4880 binfo->pbuf_ary[i].pbuf_rptr; 4881 mem_attr.mr_len = psize; 4882 if (ibt_register_mr(state->id_hca_hdl, 4883 state->id_pd_hdl, &mem_attr, 4884 &mdinfo->ip_mhdl[i + 1], 4885 &mdinfo->ip_mdsc[i + 1]) != IBT_SUCCESS) { 4886 for (; i >= 0; i--) { 4887 (void) ibt_deregister_mr( 4888 state->id_hca_hdl, 4889 mdinfo->ip_mhdl[i]); 4890 } 4891 goto ibd_mdt_copy; 4892 } 4893 DPRINT(10, "ibd_mdt_pre: psize = %lu\n", psize); 4894 } 4895 } 4896 4897 mdinfo->ip_copy = B_FALSE; 4898 4899 /* 4900 * All the deregistration must happen once the last swqe 4901 * completes. 4902 */ 4903 node->swqe_im_mblk = mp; 4904 node->w_mdtinfo = mdinfo; 4905 DPRINT(10, "ibd_mdt_pre: last wqe = %p\n", node); 4906 } else { 4907 ibd_mdt_copy: 4908 mdinfo->ip_copy = B_TRUE; 4909 } 4910 4911 /* 4912 * Do checksum related work. 4913 */ 4914 IBD_CKSUM_MDT(mp, dlmdp, NULL, &mdinfo->ip_start, &mdinfo->ip_stuff, 4915 &end, &value, &mdinfo->ip_flags); 4916 4917 mdinfo->ip_swqe = wqes; 4918 return (numwqes); 4919 } 4920 4921 /* ARGSUSED */ 4922 static void 4923 ibd_mdt_post(gld_mac_info_t *macinfo, mblk_t *mp, void *cookie) 4924 { 4925 ibd_mpack_t *mdinfo = (ibd_mpack_t *)cookie; 4926 4927 if (mdinfo->ip_copy) { 4928 if (!mdinfo->ip_noresources) 4929 freemsg(mp); 4930 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 4931 } 4932 } 4933 4934 /* 4935 * GLD entry point for transmitting a datagram. 4936 * The passed in packet has this format: 4937 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 4938 */ 4939 static int 4940 ibd_send(gld_mac_info_t *macinfo, mblk_t *mp) 4941 { 4942 ibt_status_t ibt_status; 4943 ibt_mr_attr_t mem_attr; 4944 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 4945 ibd_ace_t *ace; 4946 ibd_swqe_t *node; 4947 ipoib_mac_t *dest; 4948 ipoib_ptxhdr_t *ipibp; 4949 ip6_t *ip6h; 4950 mblk_t *nmp = mp; 4951 uint_t pktsize; 4952 size_t blksize; 4953 uchar_t *bufp; 4954 int i, ret, len, nmblks = 1; 4955 boolean_t dofree = B_TRUE; 4956 4957 if (ibd_acquire_swqes(state, &node, &node, 1) == 0) 4958 return (GLD_NORESOURCES); 4959 4960 /* 4961 * Obtain an address handle for the destination. 4962 */ 4963 dest = (ipoib_mac_t *)mp->b_rptr; 4964 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 4965 node->w_ahandle = ace; 4966 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 4967 } else { 4968 DPRINT(5, 4969 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 4970 ((ret == GLD_FAILURE) ? 
"failed" : "queued"), 4971 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 4972 htonl(dest->ipoib_gidpref[1]), 4973 htonl(dest->ipoib_gidsuff[0]), 4974 htonl(dest->ipoib_gidsuff[1])); 4975 node->w_ahandle = NULL; 4976 goto ibd_send_fail; 4977 } 4978 4979 /* 4980 * For ND6 packets, padding is at the front of the source lladdr. 4981 * Insert the padding at front. 4982 */ 4983 ipibp = (ipoib_ptxhdr_t *)mp->b_rptr; 4984 if (ntohs(ipibp->ipoib_rhdr.ipoib_type) == IP6_DL_SAP) { 4985 if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + IPV6_HDR_LEN) { 4986 if (!pullupmsg(mp, IPV6_HDR_LEN + 4987 sizeof (ipoib_ptxhdr_t))) { 4988 DPRINT(10, "ibd_send: pullupmsg failure "); 4989 ret = GLD_FAILURE; 4990 goto ibd_send_fail; 4991 } 4992 } 4993 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_ptxhdr_t)); 4994 len = ntohs(ip6h->ip6_plen); 4995 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 4996 if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + 4997 IPV6_HDR_LEN + len) { 4998 if (!pullupmsg(mp, sizeof (ipoib_ptxhdr_t) + 4999 IPV6_HDR_LEN + len)) { 5000 DPRINT(10, "ibd_send: pullupmsg " 5001 "failure "); 5002 ret = GLD_FAILURE; 5003 goto ibd_send_fail; 5004 } 5005 } 5006 /* LINTED: E_CONSTANT_CONDITION */ 5007 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 5008 } 5009 } 5010 5011 mp->b_rptr += IPOIB_ADDRL; 5012 while (((nmp = nmp->b_cont) != NULL) && 5013 (++nmblks < (state->id_max_sqseg + 1))) 5014 ; 5015 pktsize = msgsize(mp); 5016 if (pktsize > state->id_mtu) { 5017 ret = GLD_BADARG; 5018 goto ibd_send_fail; 5019 } 5020 5021 /* 5022 * Do checksum related work. 5023 */ 5024 IBD_CKSUM_SEND(mp); 5025 5026 /* 5027 * Copy the data to preregistered buffers, or register the buffer. 5028 */ 5029 if ((nmblks <= state->id_max_sqseg) && 5030 (pktsize > IBD_TX_COPY_THRESHOLD)) { 5031 for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) { 5032 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr; 5033 mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr; 5034 mem_attr.mr_as = NULL; 5035 mem_attr.mr_flags = IBT_MR_NOSLEEP; 5036 ibt_status = ibt_register_mr(state->id_hca_hdl, 5037 state->id_pd_hdl, &mem_attr, 5038 &node->w_smblkbuf[i].im_mr_hdl, 5039 &node->w_smblkbuf[i].im_mr_desc); 5040 if (ibt_status != IBT_SUCCESS) { 5041 /* 5042 * We do not expect any error other than 5043 * IBT_INSUFF_RESOURCE. 5044 */ 5045 if (ibt_status != IBT_INSUFF_RESOURCE) 5046 DPRINT(10, "ibd_send:%d\n", 5047 "failed in ibt_register_mem()", 5048 ibt_status); 5049 DPRINT(5, "ibd_send: registration failed"); 5050 node->w_swr.wr_nds = i; 5051 /* 5052 * Deregister already registered memory; 5053 * fallback to copying the mblk. 5054 */ 5055 ibd_deregister_mr(state, node); 5056 goto ibd_copy_path; 5057 } 5058 node->w_smblk_sgl[i].ds_va = 5059 (ib_vaddr_t)(uintptr_t)nmp->b_rptr; 5060 node->w_smblk_sgl[i].ds_key = 5061 node->w_smblkbuf[i].im_mr_desc.md_lkey; 5062 node->w_smblk_sgl[i].ds_len = 5063 nmp->b_wptr - nmp->b_rptr; 5064 } 5065 node->swqe_im_mblk = mp; 5066 node->w_swr.wr_sgl = node->w_smblk_sgl; 5067 node->w_swr.wr_nds = nmblks; 5068 dofree = B_FALSE; 5069 } else { 5070 ibd_copy_path: 5071 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5072 node->w_swr.wr_nds = 1; 5073 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5074 5075 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5076 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 5077 blksize = MBLKL(nmp); 5078 bcopy(nmp->b_rptr, bufp, blksize); 5079 bufp += blksize; 5080 } 5081 } 5082 5083 /* 5084 * Queue the wqe to hardware. 
5085 */ 5086 ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL); 5087 if (ibt_status != IBT_SUCCESS) { 5088 /* 5089 * We should not fail here; but just in case we do, we 5090 * tell GLD about this error. 5091 */ 5092 ret = GLD_FAILURE; 5093 DPRINT(5, "ibd_send: posting failed"); 5094 goto ibd_send_fail; 5095 } 5096 5097 DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X", 5098 INCTXPACK, htonl(ace->ac_mac.ipoib_qpn), 5099 htonl(ace->ac_mac.ipoib_gidpref[0]), 5100 htonl(ace->ac_mac.ipoib_gidpref[1]), 5101 htonl(ace->ac_mac.ipoib_gidsuff[0]), 5102 htonl(ace->ac_mac.ipoib_gidsuff[1])); 5103 5104 if (dofree) 5105 freemsg(mp); 5106 5107 return (GLD_SUCCESS); 5108 5109 ibd_send_fail: 5110 ibd_tx_cleanup(state, node, B_TRUE); 5111 return (ret); 5112 } 5113 5114 /* 5115 * GLD entry point for handling interrupts. When using combined CQ, 5116 * this handles Tx and Rx completions. With separate CQs, this handles 5117 * only Rx completions. 5118 */ 5119 static uint_t 5120 ibd_intr(gld_mac_info_t *macinfo) 5121 { 5122 ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; 5123 5124 /* 5125 * Poll for completed entries; the CQ will not interrupt any 5126 * more for incoming (or transmitted) packets. 5127 */ 5128 ibd_poll_compq(state, state->id_rcq_hdl); 5129 5130 /* 5131 * Now enable CQ notifications; all packets that arrive now 5132 * (or complete transmission) will cause new interrupts. 5133 */ 5134 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 5135 IBT_SUCCESS) { 5136 /* 5137 * We do not expect a failure here. 5138 */ 5139 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 5140 } 5141 5142 /* 5143 * Repoll to catch all packets that might have arrived after 5144 * we finished the first poll loop and before interrupts got 5145 * armed. 5146 */ 5147 ibd_poll_compq(state, state->id_rcq_hdl); 5148 5149 return (DDI_INTR_CLAIMED); 5150 } 5151 5152 /* 5153 * Common code for interrupt handling as well as for polling 5154 * for all completed wqe's while detaching. 5155 */ 5156 static void 5157 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5158 { 5159 ibd_wqe_t *wqe; 5160 ibt_wc_t *wc, *wcs; 5161 uint_t numwcs; 5162 int i; 5163 5164 /* 5165 * In some cases (eg detaching), this code can be invoked on 5166 * any cpu after disabling cq notification (thus no concurrency 5167 * exists). Apart from that, the following applies normally: 5168 * The receive completion handling is always on the Rx interrupt 5169 * cpu. Transmit completion handling could be from any cpu if 5170 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 5171 * is interrupt driven. Combined completion handling is always 5172 * on the interrupt cpu. Thus, lock accordingly and use the 5173 * proper completion array. 5174 */ 5175 if (cq_hdl == state->id_rcq_hdl) 5176 wcs = state->id_wcs; 5177 else 5178 wcs = state->id_txwcs; 5179 5180 while (ibt_poll_cq(cq_hdl, wcs, IBD_WC_SIZE, &numwcs) == IBT_SUCCESS) { 5181 5182 for (i = 0, wc = wcs; i < numwcs; i++, wc++) { 5183 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5184 ASSERT((wqe->w_type == IBD_WQE_SEND) || 5185 (wqe->w_type == IBD_WQE_RECV)); 5186 if (wc->wc_status != IBT_WC_SUCCESS) { 5187 /* 5188 * Channel being torn down. 5189 */ 5190 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5191 DPRINT(5, "ibd_intr: flush error"); 5192 /* 5193 * Only invoke the Tx handler to 5194 * release possibly held resources 5195 * like AH refcount etc. 
Can not 5196 * invoke Rx handler because it might 5197 * try adding buffers to the Rx pool 5198 * when we are trying to deinitialize. 5199 */ 5200 if (wqe->w_type == IBD_WQE_RECV) 5201 continue; 5202 } else { 5203 DPRINT(10, "%s %d", 5204 "ibd_intr: Bad CQ status", 5205 wc->wc_status); 5206 } 5207 } 5208 if (wqe->w_type == IBD_WQE_SEND) 5209 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe), 5210 B_FALSE); 5211 else 5212 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5213 } 5214 } 5215 } 5216 5217 /* 5218 * Deregister the mr associated with a given mblk. 5219 */ 5220 static void 5221 ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) 5222 { 5223 int i; 5224 5225 DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe, 5226 swqe->w_swr.wr_nds); 5227 /* 5228 * If this is an MDT case, process accordingly. 5229 */ 5230 if (swqe->w_mdtinfo != NULL) { 5231 ibd_mpack_t *mdinfo = (ibd_mpack_t *)swqe->w_mdtinfo; 5232 5233 for (i = 0; i < mdinfo->ip_segs; i++) 5234 if ((mdinfo->ip_mhdl[i] != 0) && 5235 (ibt_deregister_mr(state->id_hca_hdl, 5236 mdinfo->ip_mhdl[i]) != IBT_SUCCESS)) 5237 DPRINT(10, "MDT deregistration failed\n"); 5238 ASSERT(!mdinfo->ip_copy); 5239 kmem_free(mdinfo, sizeof (ibd_mpack_t)); 5240 swqe->w_mdtinfo = NULL; 5241 return; 5242 } 5243 5244 for (i = 0; i < swqe->w_swr.wr_nds; i++) { 5245 if (ibt_deregister_mr(state->id_hca_hdl, 5246 swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) { 5247 /* 5248 * We do not expect any errors here. 5249 */ 5250 DPRINT(10, "failed in ibt_deregister_mem()\n"); 5251 } 5252 } 5253 } 5254 5255 /* 5256 * Common code that deals with clean ups after a successful or 5257 * erroneous transmission attempt. 5258 */ 5259 static void 5260 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context) 5261 { 5262 ibd_ace_t *ace = swqe->w_ahandle; 5263 5264 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 5265 5266 /* 5267 * If this was a dynamic registration in ibd_send() or in MDT, 5268 * deregister now. 5269 */ 5270 if (swqe->swqe_im_mblk != NULL) { 5271 ibd_deregister_mr(state, swqe); 5272 freemsg(swqe->swqe_im_mblk); 5273 swqe->swqe_im_mblk = NULL; 5274 } 5275 5276 /* 5277 * Drop the reference count on the AH; it can be reused 5278 * now for a different destination if there are no more 5279 * posted sends that will use it. This can be eliminated 5280 * if we can always associate each Tx buffer with an AH. 5281 * The ace can be null if we are cleaning up from the 5282 * ibd_send() error path. 5283 */ 5284 if (ace != NULL) { 5285 /* 5286 * The recycling logic can be eliminated from here 5287 * and put into the async thread if we create another 5288 * list to hold ACE's for unjoined mcg's. 5289 */ 5290 if (DEC_REF_DO_CYCLE(ace)) { 5291 ibd_mce_t *mce; 5292 5293 /* 5294 * Check with the lock taken: we decremented 5295 * reference count without the lock, and some 5296 * transmitter might alreay have bumped the 5297 * reference count (possible in case of multicast 5298 * disable when we leave the AH on the active 5299 * list). If not still 0, get out, leaving the 5300 * recycle bit intact. 5301 * 5302 * Atomically transition the AH from active 5303 * to free list, and queue a work request to 5304 * leave the group and destroy the mce. No 5305 * transmitter can be looking at the AH or 5306 * the MCE in between, since we have the 5307 * ac_mutex lock. 
In the SendOnly reap case, 5308 * it is not neccesary to hold the ac_mutex 5309 * and recheck the ref count (since the AH was 5310 * taken off the active list), we just do it 5311 * to have uniform processing with the Full 5312 * reap case. 5313 */ 5314 mutex_enter(&state->id_ac_mutex); 5315 mce = ace->ac_mce; 5316 if (GET_REF_CYCLE(ace) == 0) { 5317 CLEAR_REFCYCLE(ace); 5318 /* 5319 * Identify the case of fullmember reap as 5320 * opposed to mcg trap reap. Also, port up 5321 * might set ac_mce to NULL to indicate Tx 5322 * cleanup should do no more than put the 5323 * AH in the free list (see ibd_async_link). 5324 */ 5325 if (mce != NULL) { 5326 ace->ac_mce = NULL; 5327 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 5328 /* 5329 * mc_req was initialized at mce 5330 * creation time. 5331 */ 5332 ibd_queue_work_slot(state, 5333 &mce->mc_req, ASYNC_REAP); 5334 } 5335 IBD_ACACHE_INSERT_FREE(state, ace); 5336 } 5337 mutex_exit(&state->id_ac_mutex); 5338 } 5339 } 5340 5341 /* 5342 * Release the send wqe for reuse. 5343 */ 5344 ibd_release_swqes(state, swqe, swqe, send_context); 5345 } 5346 5347 /* 5348 * Processing to be done after receipt of a packet; hand off to GLD 5349 * in the format expected by GLD. 5350 * The recvd packet has this format: 2b sap :: 00 :: data. 5351 */ 5352 static void 5353 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 5354 { 5355 ipoib_pgrh_t *pgrh; 5356 mblk_t *mp; 5357 ipoib_hdr_t *ipibp; 5358 ip6_t *ip6h; 5359 int rxcnt, len; 5360 5361 /* 5362 * Track number handed to upper layer, and number still 5363 * available to receive packets. 5364 */ 5365 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 5366 ASSERT(rxcnt >= 0); 5367 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); 5368 5369 /* 5370 * Adjust write pointer depending on how much data came in. 5371 */ 5372 mp = rwqe->rwqe_im_mblk; 5373 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; 5374 5375 /* 5376 * If the GRH is not valid, indicate to GLD by setting 5377 * the VerTcFlow field to 0. Else, update the pseudoGRH 5378 * so that GLD can determine the source mac of the packet. 5379 */ 5380 pgrh = (ipoib_pgrh_t *)mp->b_rptr; 5381 if (wc->wc_flags & IBT_WC_GRH_PRESENT) 5382 pgrh->ipoib_sqpn = htonl(wc->wc_qpn); 5383 else 5384 pgrh->ipoib_vertcflow = 0; 5385 5386 DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK); 5387 5388 /* 5389 * For ND6 packets, padding is at the front of the source/target 5390 * lladdr. However the inet6 layer is not aware of it, hence remove 5391 * the padding from such packets. 5392 */ 5393 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 5394 if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) { 5395 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { 5396 if (!pullupmsg(mp, IPV6_HDR_LEN + 5397 sizeof (ipoib_hdr_t))) { 5398 DPRINT(10, "ibd_process_rx: pullupmsg failed"); 5399 freemsg(mp); 5400 return; 5401 } 5402 } 5403 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 5404 len = ntohs(ip6h->ip6_plen); 5405 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5406 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + 5407 IPV6_HDR_LEN + len) { 5408 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + 5409 IPV6_HDR_LEN + len)) { 5410 DPRINT(10, "ibd_process_rx: pullupmsg" 5411 " failed"); 5412 freemsg(mp); 5413 return; 5414 } 5415 } 5416 /* LINTED: E_CONSTANT_CONDITION */ 5417 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 5418 } 5419 } 5420 5421 /* 5422 * Hand off to service thread/GLD. 
When we have hardware that 5423 * does hardware checksum, we will pull the checksum from the 5424 * work completion structure here. 5425 * on interrupt cpu. 5426 */ 5427 ibd_send_up(state, mp); 5428 5429 /* 5430 * Possibly replenish the Rx pool if needed. 5431 */ 5432 if (rxcnt < IBD_RX_THRESHOLD) { 5433 state->id_rx_short++; 5434 if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) { 5435 if (ibd_post_rwqe(state, rwqe, B_FALSE) == 5436 DDI_FAILURE) { 5437 ibd_free_rwqe(state, rwqe); 5438 return; 5439 } 5440 } 5441 } 5442 } 5443 5444 /* 5445 * Callback code invoked from STREAMs when the recv data buffer is free 5446 * for recycling. 5447 */ 5448 static void 5449 ibd_freemsg_cb(char *arg) 5450 { 5451 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 5452 ibd_state_t *state = rwqe->w_state; 5453 5454 /* 5455 * If the wqe is being destructed, do not attempt recycling. 5456 */ 5457 if (rwqe->w_freeing_wqe == B_TRUE) { 5458 DPRINT(6, "ibd_freemsg_cb: wqe being freed"); 5459 return; 5460 } 5461 5462 /* 5463 * Upper layer has released held mblk. 5464 */ 5465 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 5466 5467 if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) { 5468 /* 5469 * There are already enough buffers on the Rx ring. 5470 * Free this one up. 5471 */ 5472 rwqe->rwqe_im_mblk = NULL; 5473 ibd_delete_rwqe(state, rwqe); 5474 ibd_free_rwqe(state, rwqe); 5475 DPRINT(6, "ibd_freemsg_cb: free up wqe"); 5476 } else { 5477 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 5478 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 5479 if (rwqe->rwqe_im_mblk == NULL) { 5480 ibd_delete_rwqe(state, rwqe); 5481 ibd_free_rwqe(state, rwqe); 5482 DPRINT(6, "ibd_freemsg_cb: desballoc failed"); 5483 return; 5484 } 5485 5486 /* 5487 * Post back to h/w. We could actually have more than 5488 * id_num_rwqe WQEs on the list if there were multiple 5489 * ibd_freemsg_cb() calls outstanding (since the lock is 5490 * not held the entire time). This will start getting 5491 * corrected over subsequent ibd_freemsg_cb() calls. 5492 */ 5493 if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) { 5494 ibd_delete_rwqe(state, rwqe); 5495 ibd_free_rwqe(state, rwqe); 5496 return; 5497 } 5498 } 5499 } 5500 5501 #ifdef RUN_PERFORMANCE 5502 5503 /* 5504 * To run the performance test, first do the "ifconfig ibdN plumb" on 5505 * the Rx and Tx side. Then use mdb -kw to tweak the following variables: 5506 * ibd_performance=1. 5507 * ibd_receiver=1 on Rx side. 5508 * ibd_sender=1 on Tx side. 5509 * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update 5510 * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx, this will 5511 * make it drop into a 1 minute loop waiting for packets. An 5512 * ifconfig/unplumb on the Tx will cause it to send packets to Rx. 5513 */ 5514 5515 #define IBD_NUM_UNSIGNAL ibd_num_unsignal 5516 #define IBD_TX_PKTSIZE ibd_tx_pktsize 5517 #define IBD_TX_DATASIZE ibd_tx_datasize 5518 5519 static ibd_swqe_t **swqes; 5520 static ibt_wc_t *wcs; 5521 5522 /* 5523 * Set these on Rx and Tx side to do performance run. 5524 */ 5525 static int ibd_performance = 0; 5526 static int ibd_receiver = 0; 5527 static int ibd_sender = 0; 5528 static ipoib_mac_t ibd_dest; 5529 5530 /* 5531 * Interrupt coalescing is achieved by asking for a completion intr 5532 * only every ibd_num_unsignal'th packet. 5533 */ 5534 static int ibd_num_unsignal = 8; 5535 5536 /* 5537 * How big is each packet? 5538 */ 5539 static int ibd_tx_pktsize = 2048; 5540 5541 /* 5542 * Total data size to be transmitted. 
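* With the default 2048-byte packet size this works out to 512MB / 2KB = 262144 packets.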
5543 */ 5544 static int ibd_tx_datasize = 512*1024*1024; 5545 5546 static volatile boolean_t cq_handler_ran = B_FALSE; 5547 static volatile int num_completions; 5548 5549 /* ARGSUSED */ 5550 static void 5551 ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg) 5552 { 5553 ibd_state_t *state = (ibd_state_t *)arg; 5554 ibt_cq_hdl_t cqhdl; 5555 ibd_wqe_t *wqe; 5556 uint_t polled, i; 5557 boolean_t cq_enabled = B_FALSE; 5558 5559 if (ibd_receiver == 1) 5560 cqhdl = state->id_rcq_hdl; 5561 else 5562 cqhdl = state->id_scq_hdl; 5563 5564 /* 5565 * Mark the handler as having run and possibly freed up some 5566 * slots. Blocked sends can be retried. 5567 */ 5568 cq_handler_ran = B_TRUE; 5569 5570 repoll: 5571 while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) == 5572 IBT_SUCCESS) { 5573 num_completions += polled; 5574 if (ibd_receiver == 1) { 5575 /* 5576 * We can immediately recycle the buffer. No 5577 * need to pass up to any IP layer ... 5578 */ 5579 for (i = 0; i < polled; i++) { 5580 wqe = (ibd_wqe_t *)wcs[i].wc_id; 5581 (void) ibt_post_recv(state->id_chnl_hdl, 5582 &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL); 5583 } 5584 } 5585 } 5586 5587 /* 5588 * If we just repolled, we are done; exit. 5589 */ 5590 if (cq_enabled) 5591 return; 5592 5593 /* 5594 * Enable CQ. 5595 */ 5596 if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 5597 /* 5598 * We do not expect a failure here. 5599 */ 5600 cmn_err(CE_CONT, "ibd_perf_handler: notify failed"); 5601 } 5602 cq_enabled = B_TRUE; 5603 5604 /* 5605 * Repoll for packets that came in after we finished previous 5606 * poll loop but before we turned on notifications. 5607 */ 5608 goto repoll; 5609 } 5610 5611 static void 5612 ibd_perf_tx(ibd_state_t *state) 5613 { 5614 ibt_mr_hdl_t mrhdl; 5615 ibt_mr_desc_t mrdesc; 5616 ibt_mr_attr_t mem_attr; 5617 ibt_status_t stat; 5618 ibd_ace_t *ace = NULL; 5619 ibd_swqe_t *node; 5620 uchar_t *sendbuf; 5621 longlong_t stime, etime; 5622 longlong_t sspin, espin, tspin = 0; 5623 int i, reps, packets; 5624 5625 cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X", 5626 htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]), 5627 htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]), 5628 htonl(ibd_dest.ipoib_gidsuff[1])); 5629 if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) || 5630 (ibd_dest.ipoib_gidpref[1] == 0)) { 5631 cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address"); 5632 return; 5633 } 5634 5635 packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE); 5636 reps = (packets / IBD_NUM_SWQE); 5637 5638 cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE); 5639 cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE); 5640 cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets); 5641 cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE); 5642 cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL); 5643 if ((packets % IBD_NUM_UNSIGNAL) != 0) { 5644 /* 5645 * This is required to ensure the last packet will trigger 5646 * a CQ handler callback, thus we can spin waiting fot all 5647 * packets to be received. 
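* (The wait loop near the end of this routine spins until num_completions equals packets / IBD_NUM_UNSIGNAL signaled completions.)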
5648 */ 5649 cmn_err(CE_CONT, 5650 "ibd_perf_tx: #Packets not multiple of Signal Grp size"); 5651 return; 5652 } 5653 num_completions = 0; 5654 5655 swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE, 5656 KM_NOSLEEP); 5657 if (swqes == NULL) { 5658 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5659 return; 5660 } 5661 5662 wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); 5663 if (wcs == NULL) { 5664 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5665 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5666 return; 5667 } 5668 5669 /* 5670 * Get the ud_dest for the destination. 5671 */ 5672 ibd_async_acache(state, &ibd_dest); 5673 mutex_enter(&state->id_ac_mutex); 5674 ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0); 5675 mutex_exit(&state->id_ac_mutex); 5676 if (ace == NULL) { 5677 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5678 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5679 cmn_err(CE_CONT, "ibd_perf_tx: no AH"); 5680 return; 5681 } 5682 5683 /* 5684 * Set up the send buffer. 5685 */ 5686 sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP); 5687 if (sendbuf == NULL) { 5688 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5689 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5690 cmn_err(CE_CONT, "ibd_perf_tx: no send buffer"); 5691 return; 5692 } 5693 5694 /* 5695 * This buffer can be used in the case when we want to 5696 * send data from the same memory area over and over; 5697 * it might help in reducing memory traffic. 5698 */ 5699 mem_attr.mr_vaddr = (uint64_t)sendbuf; 5700 mem_attr.mr_len = IBD_TX_PKTSIZE; 5701 mem_attr.mr_as = NULL; 5702 mem_attr.mr_flags = IBT_MR_NOSLEEP; 5703 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 5704 &mrhdl, &mrdesc) != IBT_SUCCESS) { 5705 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5706 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5707 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5708 cmn_err(CE_CONT, "ibd_perf_tx: registration failed"); 5709 return; 5710 } 5711 5712 /* 5713 * Allocate private send wqe's. 5714 */ 5715 for (i = 0; i < IBD_NUM_SWQE; i++) { 5716 if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) { 5717 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5718 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5719 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5720 cmn_err(CE_CONT, "ibd_alloc_swqe failure"); 5721 return; 5722 } 5723 node->w_ahandle = ace; 5724 #if 0 5725 node->w_smblkbuf[0].im_mr_hdl = mrhdl; 5726 node->w_smblkbuf[0].im_mr_desc = mrdesc; 5727 node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf; 5728 node->w_smblk_sgl[0].ds_key = 5729 node->w_smblkbuf[0].im_mr_desc.md_lkey; 5730 node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE; 5731 node->w_swr.wr_sgl = node->w_smblk_sgl; 5732 #else 5733 node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE; 5734 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5735 #endif 5736 5737 /* 5738 * The last of IBD_NUM_UNSIGNAL consecutive posted WRs 5739 * is marked to invoke the CQ handler. That is the only 5740 * way we come to know when the send queue can accept more 5741 * WRs. 5742 */ 5743 if (((i + 1) % IBD_NUM_UNSIGNAL) != 0) 5744 node->w_swr.wr_flags = IBT_WR_NO_FLAGS; 5745 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5746 node->w_swr.wr_nds = 1; 5747 5748 swqes[i] = node; 5749 } 5750 5751 ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state); 5752 5753 /* 5754 * Post all the requests. 
We expect this stream of post's will 5755 * not overwhelm the hardware due to periodic completions and 5756 * pollings that happen out of ibd_perf_handler. 5757 * Post a set of requests, till the channel can accept; after 5758 * that, wait for the CQ handler to notify us that there is more 5759 * space. 5760 */ 5761 stime = gethrtime(); 5762 for (; reps > 0; reps--) 5763 for (i = 0; i < IBD_NUM_SWQE; i++) { 5764 node = swqes[i]; 5765 retry: 5766 if ((stat = ibt_post_send(state->id_chnl_hdl, 5767 &node->w_swr, 1, NULL)) != IBT_SUCCESS) { 5768 if (stat == IBT_CHAN_FULL) { 5769 /* 5770 * Spin till the CQ handler runs 5771 * and then try again. 5772 */ 5773 sspin = gethrtime(); 5774 while (!cq_handler_ran) 5775 ; 5776 espin = gethrtime(); 5777 tspin += (espin - sspin); 5778 cq_handler_ran = B_FALSE; 5779 goto retry; 5780 } 5781 cmn_err(CE_CONT, "post failure %d/%d", stat, i); 5782 goto done; 5783 } 5784 } 5785 5786 done: 5787 /* 5788 * We should really be snapshotting when we get the last 5789 * completion. 5790 */ 5791 while (num_completions != (packets / IBD_NUM_UNSIGNAL)) 5792 ; 5793 etime = gethrtime(); 5794 5795 cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d", 5796 num_completions); 5797 cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime)); 5798 cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin); 5799 5800 /* 5801 * Wait a sec for everything to get over. 5802 */ 5803 delay(drv_usectohz(2000000)); 5804 5805 /* 5806 * Reset CQ handler to real one; free resources. 5807 */ 5808 if (ibd_separate_cqs == 0) { 5809 ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state); 5810 } else { 5811 if (ibd_txcomp_poll == 0) 5812 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, 5813 state); 5814 else 5815 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5816 } 5817 5818 for (i = 0; i < IBD_NUM_SWQE; i++) 5819 ibd_free_swqe(state, swqes[i]); 5820 (void) ibt_deregister_mr(state->id_hca_hdl, mrhdl); 5821 kmem_free(sendbuf, IBD_TX_PKTSIZE); 5822 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5823 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5824 } 5825 5826 static void 5827 ibd_perf_rx(ibd_state_t *state) 5828 { 5829 wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); 5830 if (wcs == NULL) { 5831 kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); 5832 cmn_err(CE_CONT, "ibd_perf_tx: no storage"); 5833 return; 5834 } 5835 5836 /* 5837 * We do not need to allocate private recv wqe's. We will 5838 * just use the regular ones. 5839 */ 5840 5841 num_completions = 0; 5842 ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state); 5843 5844 /* 5845 * Delay for a minute for all the packets to come in from 5846 * transmitter. 5847 */ 5848 cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_SWQE); 5849 delay(drv_usectohz(60000000)); 5850 cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions); 5851 5852 /* 5853 * Reset CQ handler to real one; free resources. 5854 */ 5855 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 5856 kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); 5857 } 5858 5859 static void 5860 ibd_perf(ibd_state_t *state) 5861 { 5862 if (ibd_performance == 0) 5863 return; 5864 5865 if (ibd_receiver == 1) { 5866 ibd_perf_rx(state); 5867 return; 5868 } 5869 5870 if (ibd_sender == 1) { 5871 ibd_perf_tx(state); 5872 return; 5873 } 5874 } 5875 5876 #endif /* RUN_PERFORMANCE */ 5877
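/*
 * A minimal sketch (not part of the driver build) of the poll/arm/repoll
 * pattern that ibd_intr() and ibd_perf_handler() rely on: drain the CQ,
 * re-arm notifications, then drain once more to pick up completions that
 * arrived in between. The helper name example_drain_cq and the
 * caller-supplied processing hook are hypothetical; only the
 * ibt_poll_cq() and ibt_enable_cq_notify() calls mirror the usage above.
 */
#if 0
static void
example_drain_cq(ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs, uint_t wc_size,
    void (*process)(ibt_wc_t *, uint_t))
{
	uint_t numwcs;

	/* First pass: reap everything that has already completed. */
	while (ibt_poll_cq(cq_hdl, wcs, wc_size, &numwcs) == IBT_SUCCESS)
		process(wcs, numwcs);

	/* Re-arm the CQ so the next completion raises an interrupt. */
	if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS)
		DPRINT(10, "example_drain_cq: ibt_enable_cq_notify() failed");

	/*
	 * Second pass: completions that arrived after the first poll
	 * loop emptied the CQ but before notifications were re-armed
	 * would otherwise never generate an interrupt.
	 */
	while (ibt_poll_cq(cq_hdl, wcs, wc_size, &numwcs) == IBT_SUCCESS)
		process(wcs, numwcs);
}
#endif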